Steps of my analysis

Load all relevant packages:

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.2     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'zoo'
## 
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Loading required package: usethis
## 
## Loading required package: spData
## 
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## 
## Loading required package: sf
## 
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: foreach
## 
## 
## Attaching package: 'foreach'
## 
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## 
## Loading required package: parallel
## 
## This is INLA_22.12.16 built 2022-12-23 13:24:10 UTC.
##  - See www.r-inla.org/contact-us for how to get help.
## 
## 
## Attaching package: 'reshape2'
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
## 
## 
## 
## Attaching package: 'jsonlite'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
## 
## 
## Use the function
## RAQSAPI::aqs_credentials(username, key)
## before using other RAQSAPI functions
## See ?RAQSAPI::aqs_credentials for more information
## 
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
## 
## corrplot 0.92 loaded
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Loading and (quickly cleaning) all necessary datasets:

###SoA data

soa.data = read_xlsx("SoA.data.1019.xlsx")

###county_flips are unique identifier for counties
soa.data$county_fips = as.character(soa.data$county_fips) ##change it to character

#IMPORTANT

# This shape file contains the coordinates for county boundaries
##counties is from urbanmap

CA.counties = urbnmapr::counties %>% filter(state_abbv == "CA")

###IF WE WANT TO BOIL DOWN TIME SERIES AND KEEP ALL DATA, SWITCH to CA_newdata below
soa_joint <- left_join(CA.counties, soa.data, by = "county_fips")
## Warning in left_join(CA.counties, soa.data, by = "county_fips"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

#Use with soa.data (full data)
CA_data = soa_joint %>% select(long, lat, county_name.y, Year, Score, Total_Pop,
                               EDUC_Lessthan9, EDUC_college, White_Collar,
                               Unemployment_Rate, Adj_HH_income, Income_Disparity,
                               Individuals_Below_Poverty, Median_Home_Value,
                               Median_Gross_Rent, Housing_No_Telephone,
                               Housing_Incomplete_Plumbing)

### Quick descriptions of SoA data variables
# Score: social deprivation index (SDI) score calculated from the following 11 subindices
# EDUC_Lessthan9: % of population older than 24 with less than 9 years of education
# EDUC_college: % of population older than 24 with at least four years of college education
# White_Collar: % of population older than 15 employed in a white collar occupation
# Unemployment_Rate: unemployment rate for population older than 15
# Adj_HH_income: median household income adjusted for local housing costs
# Income_Disparity: an income disparity ratio
# Individuals_Below_Poverty: % of population below the federal poverty line
# Median_Home_Value: median home value for owned, occupied units
# Median_Gross_Rent: median gross rent for rented units
# Housing_No_Telephone: % of households without a telephone
# Housing_Incomplete_Plumbing: % of households with incomplete plumbing

colnames(CA_data)[3] = "County"

CA_newdata = soa.data[1:58,]
CA_newdata = CA_newdata[,-c(4,7,8)]


###Cal-ViDa data
mortality = read.csv("respmortality1419.csv") #data from 2014-2019 bc we want to avoid COVID pandemic era
mortality = filter(mortality,Cause_of_Death %in% c("Chronic lower respiratory diseases","Influenza and pneumonia"))
mortality = mortality[,-c(1,4,9)]
Population = rep(100000,nrow(mortality))
mortality = cbind(mortality,Population)

Heatmap of population by county

#Initializing map and station locations
ca_map <- map_data("county", region = "california")

#Match population dataset with ca_map
#2010-2019 population data for CA 
USpops = read.csv("CA_census_pops1019.csv")
CApops = USpops %>% filter(STNAME == "California") %>% select(CTYNAME,POPESTIMATE2019)
CApops = CApops[-1,]

CApops$CTYNAME = unique(ca_map$subregion)
colnames(CApops) = c("subregion","pop")

merged_data <- merge(ca_map, CApops, by = "subregion", all.x = TRUE)

#Plot
gg_pop <- ggplot() +
  geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = pop), 
               color = "black") +
  coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
  theme_void() +
  labs(title = "Heatmap of County Populations for 2019",fill = expression("Population")) +
  scale_fill_gradient(low = "yellow", high = "red")

print(gg_pop)

California state with county labels for reference:

SKATER clustering

The code below structures the dataframe to be fed into the spatial data frame (SPDF) object. Dimensions are 58 rows by 10 columns (each column is its own year)

###Setting up SPDF for CA counties 
CA_sf = st_read(getwd(),"CA_Counties_TIGER2016")
## Reading layer `CA_Counties_TIGER2016' from data source 
##   `C:\Users\jeffr\Desktop\Spatiotemporal + Causal Inference\Wildfire Paper 1 Code' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -13857270 ymin: 3832931 xmax: -12705030 ymax: 5162404
## Projected CRS: WGS 84 / Pseudo-Mercator
CA_spdf = as_Spatial(CA_sf)

# score_scaled = scale(CA_data$Score)

c_index = unique(CA_data$County)
y_index = unique(CA_data$Year)
SDI_df = matrix(nrow=10,ncol=58)
track1 = 1
track2 = 1

for (i in c_index){
  for (j in y_index){
    scores = CA_data %>% filter(County == i) %>% filter(Year == j) %>% select(Score) %>% unique()
    SDI_df[track1,track2] = scores$Score
    track1 = track1 + 1
  }
  track1 = 1
  track2 = track2 + 1
}

score_scaled = scale(SDI_df) #NEED TO SCALE THE DATA BEFORE FEEDING IT INTO SKATER

#covariates_scale = data.frame(apply(CA_data[,4:16],2,scale))
covariates_scale = data.frame(t(score_scaled))

CA_spdf@data = covariates_scale

Using the SPDF from above, we follow the steps of SKATER tutorial (https://www.dshkol.com/post/spatially-constrained-clustering-and-regionalization/) to generate three separate clustering results: (1) Unconstrained/default (2) Clusters have minimum population constraint based on the total population / # of clusters (3) Clusters are comprised of a minimum number of counties (8 for smaller number of clusters, 4 for bigger numbers)

#Identify neighborhood list for counties 
CA_nb = poly2nb(CA_spdf)

#summary(CA_nb)

# plot(CA_spdf, main = "With queen")
# plot(CA_nb, coords = coordinates(CA_spdf), col="blue", add = TRUE)

#Calculate edge costs (dissimilarity matrix) based on Euclidean distance 
costs <- nbcosts(CA_nb, data = covariates_scale)

###Get adjacency matrix using nb2mat() (SEPARATE STEP FOR INLA)
adj = nb2mat(CA_nb,style = "B")

#Style means the coding scheme style used to create the weighting matrix 
# B: basic binary coding scheme
# W: row standardized coding scheme 
# C: globally standardized coding scheme  
# U: values of C / number of neighbors 
# S: variance stabilizing coding scheme 

#Transform edge costs to spatial weights 
ct_w <- nb2listw(CA_nb,costs,style="B")

#Create minimum spanning tree 
ct_mst <- mstree(ct_w)

plot(ct_mst,coordinates(CA_spdf),col="blue", cex.lab=0.5)
plot(CA_spdf, add=TRUE)

#Run SKATER algorithm to get 7 contiguous clusters (cluster idx is in order of CA_sf)
clus7 <- skater(edges = ct_mst[,1:2], data = covariates_scale, ncuts = 6)

#Determine an appropriate minimum population threshold based on???
pops_summary = summary(unique(CA_data$Total_Pop))
pops_summary
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    12700    63275   219705   710796   750235 10105722
#Idea 1: Use median * (how many counties should be in a cluster at minimum)
min_pop = as.numeric(pops_summary[3] * 4)

#Idea 2: If we assume CA population is 39M, divide total pop by # clusters
min_pop2 = 39000000 / 7

#Add a min population constraint
clus7_min <- skater(edges = ct_mst[,1:2], 
                     data = covariates_scale, 
                     crit = 5000000, 
                     vec.crit = CA_data$Total_Pop,
                     ncuts = 6)

#Add a minimum number of areas in each cluster constraint 
clus7_minarea = skater(edges = ct_mst[,1:2], data = covariates_scale, ncuts = 6, 4)


CA_data_cluster = (CA_sf %>% mutate(clus = clus7_minarea$groups))

#Plot clustered CA
plot((CA_sf %>% mutate(clus = clus7$groups))['clus'], main = "7 cluster example")

plot((CA_sf %>% mutate(clus = clus7_min$groups))['clus'], main = "7 cluster example with population constraint")

plot((CA_sf %>% mutate(clus = clus7_minarea$groups))['clus'], main = "7 cluster example with minimum number of areas constraint")

#plot(CA_sf,col=c("red","green","blue","purple","yellow")[clus7_min$groups],max.plot=17)

For reference, here are the cluster labels for each county:

clusterlabels = data.frame(CA_data_cluster$NAME,clus7_minarea$groups)
names(clusterlabels) = c("counties","Cluster")

o = order(clusterlabels$counties)
clusterlabels = clusterlabels[o,]
rownames(clusterlabels) = NULL

clusterlabels
##           counties Cluster
## 1          Alameda       4
## 2           Alpine       5
## 3           Amador       5
## 4            Butte       1
## 5        Calaveras       5
## 6           Colusa       6
## 7     Contra Costa       5
## 8        Del Norte       1
## 9        El Dorado       6
## 10          Fresno       3
## 11           Glenn       6
## 12        Humboldt       1
## 13        Imperial       7
## 14            Inyo       4
## 15            Kern       4
## 16           Kings       4
## 17            Lake       5
## 18          Lassen       2
## 19     Los Angeles       7
## 20          Madera       3
## 21           Marin       5
## 22        Mariposa       5
## 23       Mendocino       1
## 24          Merced       5
## 25           Modoc       2
## 26            Mono       3
## 27        Monterey       4
## 28            Napa       5
## 29          Nevada       6
## 30          Orange       7
## 31          Placer       6
## 32          Plumas       2
## 33       Riverside       7
## 34      Sacramento       5
## 35      San Benito       3
## 36  San Bernardino       4
## 37       San Diego       7
## 38   San Francisco       5
## 39     San Joaquin       4
## 40 San Luis Obispo       4
## 41       San Mateo       4
## 42   Santa Barbara       4
## 43     Santa Clara       4
## 44      Santa Cruz       4
## 45          Shasta       1
## 46          Sierra       6
## 47        Siskiyou       2
## 48          Solano       5
## 49          Sonoma       1
## 50      Stanislaus       5
## 51          Sutter       6
## 52          Tehama       1
## 53         Trinity       2
## 54          Tulare       4
## 55        Tuolumne       5
## 56         Ventura       4
## 57            Yolo       6
## 58            Yuba       2
counties = clusterlabels$counties
num_clus = max(clus7_minarea$groups)

HUGE graph estimation

This code chunk takes the cluster grouping from SKATER and aggregates the full dataframe from the SPDF (58x10) into a 10x8 matrix (10 time points x 8 clusters) to be fed into the graph estimation package HUGE. The data from each county in a given cluster is aggregated based on a population weighted mean.

HUGE uses glasso to estimate a graph structure based on the aggregated feature data which recall, is the SDI score (socioeconomic status) from the SoA. We use a grid of lambda values under 1 in order to ensure that some edges will be present in the estimates produced by HUGE. This decision is supported by the fact that partial correlations calculated via regression appear to be statistically significant. Based on simulation results, we believe that EBIC is a suitable criterion for choosing the best estimated graph in the huge.select() step.

The objective function for EBIC is as follows: \(EBIC_{\gamma}(s) = -2log L_n \{ \hat \theta(s) \} + \nu(s) log n + 2 \gamma log \tau(S_j)\) where \(0 \leq \gamma \leq 1\) and \(S_j\) is model space of size \(\tau(S_j)\)

#Aggregate feature vectors into one vector for each SKATER cluster
CA_cluster = data.frame(CA_sf$NAMELSAD,clus7_minarea$groups)
names(CA_cluster) = c("County","Cluster")
year = 2010:2019

CA_cluster = left_join(CA_cluster,CA_data,by = "County")

#Get weighted avg value for Score for each cluster for each year 
#Create new data matrix of aggregated feature vectors 
cluster_features = matrix(NA,nrow = 10,ncol = 7)

for (i in 1:7){
  cluster = CA_cluster %>% filter(Cluster == i)

  for(j in 1:10){
    #Obtain a weighted mean based on population
    vec = cluster %>% filter(Year == year[j]) %>% select(Score,Total_Pop) %>% unique()
    cluster.pop = sum(vec$Total_Pop)
    cluster.popweights = vec$Total_Pop/cluster.pop
    cluster_features[j,i] = weighted.mean(vec$Score,cluster.popweights)
  }
}

#Graph learning w HUGE
out.glasso = huge(cluster_features,lambda = seq(0.95,0.05,by=-0.05),method="glasso")
## 
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 5%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 10%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 15%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 21%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 26%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 31%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 36%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 42%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 47%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 52%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 57%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 63%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 68%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 73%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 78%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 84%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 89%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 94%
Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 100%
## Conducting the graphical lasso (glasso)....done.                                          
glasso.stars = huge.select(out.glasso,criterion = "stars",stars.thresh = 0.1)
## Conducting Subsampling....in progress:5% 
Conducting Subsampling....in progress:10% 
Conducting Subsampling....in progress:15% 
Conducting Subsampling....in progress:20% 
Conducting Subsampling....in progress:25% 
Conducting Subsampling....in progress:30% 
Conducting Subsampling....in progress:35% 
Conducting Subsampling....in progress:40% 
Conducting Subsampling....in progress:45% 
Conducting Subsampling....in progress:50% 
Conducting Subsampling....in progress:55% 
Conducting Subsampling....in progress:60% 
Conducting Subsampling....in progress:65% 
Conducting Subsampling....in progress:70% 
Conducting Subsampling....in progress:75% 
Conducting Subsampling....in progress:80% 
Conducting Subsampling....in progress:85% 
Conducting Subsampling....in progress:90% 
Conducting Subsampling....in progress:95% 
Conducting Subsampling....in progress:100% 
Conducting Subsampling....done.                  
glasso.ric = huge.select(out.glasso,criterion = "ric")
## Conducting rotation information criterion (ric) selection....done
## Computing the optimal graph....done
glasso.ebic = huge.select(out.glasso,criterion = "ebic")
## Conducting extended Bayesian information criterion (ebic) selection....done
plot(glasso.stars)

plot(glasso.ric)

plot(glasso.ebic)

huge.est = glasso.ebic$refit
huge.est
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]    0    1    1    0    1    1    1
## [2,]    1    0    1    0    0    1    1
## [3,]    1    1    0    1    0    1    0
## [4,]    0    0    1    0    1    1    1
## [5,]    1    0    0    1    0    0    1
## [6,]    1    1    1    1    0    0    1
## [7,]    1    1    0    1    1    1    0
degree_connectivity = data.frame(colSums(huge.est))
colnames(degree_connectivity) = "node_connections"
degree_connectivity = cbind(c(1:num_clus),degree_connectivity)

Plotting HUGE graph

install.packages("igraph")
library(igraph)
# Convert the adjacency matrix to a graph object
g <- graph_from_adjacency_matrix(huge.est, mode = "undirected")

# Assign custom labels to vertices
V(g)$name <- c(1,2,3,4,5,6,7)

# Assign colors to vertices
V(g)$color <- c("red", "cyan", "green", "yellow", "purple", "orange", "pink")

# Plot the graph with labeled vertices
plot(g, vertex.label = V(g)$name, vertex.color = V(g)$color, vertex.size = 20)

Transforming estimated adjacency matrix to graph filter H

The code below takes the adjacency matrix estimated in the previous step and transforms it into a graph filter H. The steps are explained in Antonian et al (Gareth’s paper). The cutoff transformation is used for the eigenvalues.

A = as.matrix(huge.est)
p = nrow(A)

#obtain graph Laplacian L
D = diag(p)
for (i in 1:p){
  d = sum(A[,i])
  D[i,i] = d
}

L = D - A

#eigendecomposition of L
Ldecomp = eigen(L)
U = as.matrix(Ldecomp$vectors)
Lambdas = Ldecomp$values

#test
#U %*% (diag(p)*Lambdas) %*% t(U)

#Function implementing cutoff tranform for eigenvalues 
cutoff.transform = function(lambdas,q){
  transformed = c()
  cutoff = quantile(lambdas,q)
  for (i in lambdas){
    if(i <= cutoff){
      transformed = c(transformed,1)
    }
    else{
      transformed = c(transformed,0)
    }
  }
  
  return(transformed)
}

#quantile(Lambdas,2/3)
transformed.L = cutoff.transform(Lambdas,2/3)
eta.L = diag(p)*transformed.L

#obtain graph filter
H = U %*% eta.L %*% t(U)
H
##             [,1]         [,2]         [,3]        [,4]         [,5]        [,6]
## [1,]  0.58851806 -0.068010981  0.163559807 -0.28070511  0.150878170  0.29315326
## [2,] -0.06801098  0.956168924  0.086945750 -0.06181596 -0.008836559 -0.05532934
## [3,]  0.16355981  0.086945750  0.824846850  0.13992515  0.002116461  0.07426411
## [4,] -0.28070511 -0.061815959  0.139925150  0.80121219  0.086945750  0.15087817
## [5,]  0.15087817 -0.008836559  0.002116461  0.08694575  0.909676139 -0.21504407
## [6,]  0.29315326 -0.055329344  0.074264113  0.15087817 -0.215044073  0.46065322
## [7,]  0.15260679  0.150878170 -0.291658130  0.16355981  0.074264113  0.29142465
##             [,7]
## [1,]  0.15260679
## [2,]  0.15087817
## [3,] -0.29165813
## [4,]  0.16355981
## [5,]  0.07426411
## [6,]  0.29142465
## [7,]  0.45892461
gfilter_weight = norm((1/7)*H^2,type = "F")
# gfilter_weight = norm(H^2,type = "F")

Create a function to generate heatmap plots of matrices

matrix_heatmap= function(matrix,title = "",gradient_zones = c(0,0.8,0.999)){
  r = nrow(matrix)
  df = as_tibble(cbind(expand.grid(rev(seq_len(r)),seq_len(r)),c(matrix))) %>% setNames(c("row","col","value"))
  df$value[df$value == 1] = 0.999
  
  plot = ggplot(df,mapping = aes(x=row,y=col,fill=value)) + geom_tile() + 
    scale_fill_gradientn(colors = c("yellow","orange","red"),
                       values = rescale(gradient_zones),
                       limits = c(0, 0.99),
                       oob = squish) + ggtitle(title)
  return(plot)
}

Heatmap plot of graph filter

#Heatmap of resulting H 
corrplot(H, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
         title = "Graph filter")

matrix_heatmap(H,title = "")

Downloading EPA data

library(tidyverse)
library(plyr)
## Warning: package 'plyr' was built under R version 4.2.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(dplyr)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(maps)
## Warning: package 'maps' was built under R version 4.2.3
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)
## Warning: package 'mapdata' was built under R version 4.2.3
library(geosphere)
library(urbnmapr)
library(RAQSAPI)
library(con2aqi)

aqs_credentials("jeffreywu@ucsb.edu","copperheron86")

Get county and pollutant reference codes from EPA

#Get county codes 
counties_url = "https://aqs.epa.gov/data/api/list/countiesByState?email=jeffreywu@ucsb.edu&key=copperheron86&state=06&"

countycodes = fromJSON(counties_url)
countycodes = countycodes[[2]]
california_counties = countycodes$code

#Get parameter codes
parameters_url = "https://aqs.epa.gov/data/api/list/parametersByClass?email=jeffreywu@ucsb.edu&key=copperheron86&pc=CRITERIA"

parametercodes = fromJSON(parameters_url)
parametercodes = parametercodes[[2]]

#Do we want to keep Lead PM10 or delete
parametercodes = parametercodes[-7,]

pollutants = data.frame(parametercodes$code)
labels = c("lead","co","so2","no2","o3","pm10","pm25")
pollutants = cbind(pollutants,labels)

Identifying a set of monitoring stations that are well distributed across CA

The goal here is to query all of the EPA measurements for all 7 pollutants and AQI from 2014-2019. The first step is to query all stations in California that collect measurements for at least one of the 7 pollutants.

FUNCTION THAT QUERIES STATION LOCATIONS FOR A GIVEN POLLUTANT

query_aqs_station_data <- function(param,year){
  start_date <- paste0(year, "0101")
  end_date <- paste0(year, "1231")
  
  url <- paste0("https://aqs.epa.gov/data/api/monitors/byState?email=jeffreywu@ucsb.edu&key=copperheron86&param=", param, "&bdate=", start_date, "&edate=", end_date, "&state=06")
  
  myData <- fromJSON(url)
  station_data = myData[[2]]
  
  return(station_data)
}

FOR EACH POLLUTANT, GRAB ALL MONITORING STATIONS FOR EACH YEAR

stations_url = "https://aqs.epa.gov/data/api/monitors/byState?email=jeffreywu@ucsb.edu&key=copperheron86&param=88101&bdate=20140101&edate=20141231&state=06"

stations = fromJSON(stations_url)
stations = stations[[2]]

station_data2014_pm2.5 = stations %>% select(latitude,longitude,site_number, local_site_name,county_code,county_name)

#Get monitoring station locations for each pollutant for each year (takes approx 3 min)
all_pollutants_station_data <- list()
for (year in 2014:2019){
  year_data <- lapply(pollutants, query_aqs_station_data, year = year)
  all_pollutants_station_data[[as.character(year)]] <- year_data
}


# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5

all_pollutants_station_2014data = all_pollutants_station_data[[1]]
all_pollutants_station_2015data = all_pollutants_station_data[[2]]
all_pollutants_station_2016data = all_pollutants_station_data[[3]]
all_pollutants_station_2017data = all_pollutants_station_data[[4]]
all_pollutants_station_2018data = all_pollutants_station_data[[5]]
all_pollutants_station_2019data = all_pollutants_station_data[[6]]

Identify subset of stations that has best spatial coverage wrt CA state

Starting off with a list of all the stations measuring each parameter/pollutant, we want to identify a subset of stations that measure each pollutant for each county. This overall set of stations (subset for each county combined together) should have a good spatial coverage of the state.

This is difficult because there is not a station in every county measuring each pollutant. So in order to identify a good set of stations to query data from, I first looked up the two largest cities in each county, based on population.

Lat/long for 2 biggest cities (based on population) in each county

citylats = c(37.8044,37.5485,38.7743,38.8071,38.3527,38.3488,39.7285,39.7596,
             38.1231,38.0678,39.2143,39.1546,37.9780,38.0049,41.7558,41.7548,
             38.9399,38.6688,36.7378,36.8252,39.7474,39.5243,40.8021,40.8665,
             32.7920,32.6789,37.3614,37.3855,35.3733,35.7688,36.3275,36.3008,
             38.9582,38.8080,40.4163,40.2840,34.0522,33.7701,36.9613,37.1230,
             37.9735,38.1074,37.4849,37.4320,39.4457,39.4096,37.3022,37.0583,
             41.4871,41.4099,37.6485,38.5149,36.6777,36.6149,38.2975,38.1749,
             39.3280,39.2191,33.8366,33.7455,38.7521,38.7907,39.9341,40.3063,
             33.9806,33.9425,38.5816,38.4088,36.8525,36.8125,34.1083,34.0922,
             32.7157,32.6401,37.7749,37.9780,37.9577,37.7396,35.2828,35.6369,
             37.6879,37.5630,34.9530,34.4208,37.3387,37.3688,36.9741,36.9102,
             40.5865,40.4482,39.6763,39.5595,41.7354,41.3099,38.1041,38.2492,
             38.4404,38.2324,37.6393,37.4946,39.1404,39.1165,40.1785,39.9277,
             40.7310,40.4156,36.3301,36.2077,38.0297,37.9829,34.1975,34.1706,
             38.5449,38.6785,39.1277,39.0954)

citylongs = c(122.2712,121.9886,119.8219,119.7960,120.9327,120.7741,121.8375,121.6219,
              120.8509,120.5385,122.0094,122.1494,122.0311,121.8058,124.2026,124.1580,
              119.9772,120.9872,119.7871,119.7029,122.1964,122.1936,124.1637,124.0828,
              115.5631,115.4989,118.3997,118.4105,119.0187,119.2471,119.6457,119.7829,
              122.6264,122.5583,120.6530,120.5394,118.2437,118.1937,120.0607,120.2602,
              122.5311,122.5697,119.9663,120.0985,123.8053,123.3556,120.4830,120.8499,
              120.5425,120.6791,118.9721,119.4768,121.6555,121.8221,122.2869,122.2608,
              120.1833,121.0611,117.9143,117.8677,121.2880,121.2358,120.8980,121.2319,
              117.3755,117.2297,121.4944,121.3716,121.4016,121.3658,117.2898,117.4350,
              117.1611,117.0842,122.4194,122.0311,121.2908,121.4260,120.6596,120.6545,
              122.4702,122.3255,120.4357,119.6982,121.8853,122.0363,122.0308,121.7569,
              122.3917,122.2978,120.2410,120.8277,122.6345,122.3106,122.2566,122.0405,
              122.7141,122.6367,120.9970,120.8460,121.6169,121.6380,122.2358,122.1792,
              122.9420,123.2100,119.2966,119.3473,119.9741,120.3822,119.1771,118.8376,
              121.7405,121.7733,121.5508,121.5522)
citylongs = -1*citylongs

Alameda: Oakland (429082) and Fremont

Alpine: Alpine Village (225) and Mesa Vista

Amador: Ione (8363) and Jackson

Butte: Chico (94776) and Paradise

Calaveras: Rancho Calaveras (5324) and Angels Camp

Colusa: Colusa (5911) and Williams

Contra Costa: Concord (129688) and Antioch

Del Norte: Crescent City (6805) and Bertsch-Oceanview

El Dorado: South Lake Tahoe (22036) and Cameron Park

Fresno: Fresno (530093) and Clovis

Glenn: Orland (7644) and Willows

Humboldt: Eureka (26998) and Arcata

Imperial: El Centro (44120) and Calexico

Inyo: Bishop (3746) and Dixon Lane-Meadow Creek

Kern: Bakersfield (383579) and Delano

Kings: Hanford (56910) and Lemoore

Lake: Clearlake (15384) and Hidden Valley Lake

Lassen: Susanville (15165) and Janesville

Los Angeles: Los Angeles (3990000) and Long Beach

Madera: Madera (65706) and Chowchilla

Marin: San Rafael (58704) and Novato

Mariposa: Mariposa (1526) and Catheys Valley

Mendocino: Fort Bragg (7359) and Willits

Merced: Merced (83316) and Los Banos

Modoc: Alturas (2509) and California Pines

Mono: Mammoth Lakes (8127) and Walker

Monterey: Salinas (156259) and Seaside

Napa: Napa (79263) and American Canyon

Nevada: Truckee (16561) and Grass Valley

Orange: Anaheim (352005) and Santa Ana

Placer: Roseville (139117) and Rocklin

Plumas: East Quincy (2489) and Chester

Riverside: Riverside (330063) and Moreno Valley

Sacramento: Sacramento (508529) and Elk Grove

San Benito: Hollister (39749) and Ridgemark

San Bernandino: San Bernandino (215941) and Fontana

San Diego: San Diego (1426000) and Chula Vista

San Francisco: San Francisco (810000) and Concord

San Joaquin: Stockton (311178) and Tracy

San Luis Obispo: San Luis Obispo (47446) and Paso Robles

San Mateo: Daly City (107008) and San Mateo

Santa Barbara: Santa Maria (107408) and Santa Barbara

Santa Clara: San Jose (1030000) and Sunnyvale

Santa Cruz: Santa Cruz (64725) and Watsonville

Shasta: Redding (91772) and Anderson

Sierra: Loyalton (700) and Downieville

Siskiyou: Yreka (7556) and Mount Shasta

Solano: Vallejo (121913) and Fairfield

Sonoma: Santa Rosa (177586) and Petaluma

Stanislaus: Modesto (215030) and Turlock

Sutter: Yuba City and South Yuba City

Tehama: Red Bluff (14283) and Corning

Trinity: Weaverville (3667) and Post Mountain

Tulare: Visalia (133800) and Tulare

Tuolumne: Phoenix Lake-Cedar Ridge (5108) and Sonora

Ventura: Oxnard (209877) and Thousand Oaks

Yolo: Davis (69289) and Woodland

Yuba: Linda (17773) and Olivehurst

Then, I created the function below to choose a group of stations that are within a certain distance (Haversine distance from the latitude and longitude) of each city that I identified in the previous step. If there are less than 5 stations associated to a given city, the distance threshold (which starts at 100km) is increased by 50km.

FUNCTION THAT SELECTS SET OF STATIONS CLOSEST TO A GIVEN LAT/LONG

# Function to filter stations based on spatial coverage
subset_stations_by_spatial_coverage <- function(station_data, reference_lat, reference_lon, max_distance_km=100) {
  # Calculate distances between stations and reference location
  distances <- distHaversine(
    cbind(station_data$longitude, station_data$latitude),
    c(reference_lon, reference_lat)
  )
  distances <- distances/1000
  
  # idx =  which(distances == min(distances))
  # #Identify station within min distance to centroid of county
  # station_data_subset <- station_data[idx, ]
  
  # Subset stations within the specified max_distance_km
  idx = which(distances <= max_distance_km)
  station_data_subset <- station_data[idx, ]
  station_data_subset <- cbind(station_data_subset,distances[idx])
  
  while (nrow(station_data_subset) < 5){
    max_distance_km = max_distance_km + 50
    station_data_subset = subset_stations_by_spatial_coverage(station_data, 
                              reference_lat, reference_lon, max_distance_km)
  }
  
  return(station_data_subset)
}

# # Construct the subset of stations based on spatial coverage criteria (test)
# reference_lat = citylats[3]
# reference_lon = citylongs[3]
# max_distance_km = 100
# 
# subset_stations <- subset_stations_by_spatial_coverage(station_data2014_pm2.5, reference_lat, reference_lon, max_distance_km)
# 
# # Print the subset of stations
# print(subset_stations)
# Obtain centroid lat/longs for each county 
CA.counties2 = read.csv("counties.ca.data.csv")
ca.coordinates = data.frame(CA.counties2$county,CA.counties2$lat,CA.counties2$lng)
colnames(ca.coordinates) = c("county","lat","long")

ca.coordinates = ca.coordinates[order(ca.coordinates$county),]
row.names(ca.coordinates) = NULL

IMPORTANT FUNCTION:

Given a dataset containing station locations/codes for a given pollutant and year, the function below selects 5-20 stations that are closest to the lat/longs for the two biggest cities in each county and puts the station information (code, lat, long, etc) into a dataframe.

#Function that finds best monitoring station for each county for a specific pollutant for a specific year
# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5

best_stations = function(stationdata,pollutant){

  subset_list = list()
  
  #Load lat/longs for 58x2 cities into dataframe
  CA.coords = data.frame(rep(countycodes$value_represented,each = 2),citylats,citylongs)
  colnames(CA.coords) = c("County","Lat","Long")
  
  #Find closest station for each county centroid using subset_stations_by_spatial_coverage function
  for (i in 1:nrow(CA.coords)){
  reference_lat = CA.coords$Lat[i]
  reference_lon = CA.coords$Long[i]
  max_distance_km = 100
  
  subset_stations <- subset_stations_by_spatial_coverage(stationdata[[pollutant]], reference_lat, reference_lon, max_distance_km)
  subset_list[[i]] = subset_stations
  }
  
  #Combine pairs of city lists together 
  subset_list2 = list()
  sequence = seq(2,116,2)
  for(i in sequence){
    combine = rbind(subset_list[[i]],subset_list[[i-1]])
    subset_list2[[i-1]] = combine
  }
  subset_list2 =subset_list2[!sapply(subset_list2,is.null)]
  
  #Create a county label vector
  repnames = c()
  for(i in 1:58){
    repnames = c(repnames,nrow(subset_list2[[i]]))
  }  
  countylabels = rep(countycodes$value_represented,times = repnames)
  
  #Format the list into dataframe
  beststations = as.data.frame(do.call(rbind, subset_list2))
  beststations = cbind(countylabels,beststations$county_name,
                       beststations$`distances[idx]`,beststations)
  colnames(beststations)[c(1,2,3)] = c("measuring_county","station_county","distance_apart")
  rownames(beststations) = NULL
  
  return(beststations)
}

# #test cases
# pm2.5_stations_2014 = best_stations(all_pollutants_station_2014data,7)
# CO_stations_2016 = best_stations(all_pollutants_station_2016data,2)

CREATING BEST STATION LIST/DATAFRAME FOR EACH POLLUTANT, EACH ENTRY IS A YEAR

#Generate list for best stations for each pollutant for each year
Lead_stations = list()

Lead_stations[[1]] = best_stations(all_pollutants_station_2014data,1)
Lead_stations[[2]] = best_stations(all_pollutants_station_2015data,1)
Lead_stations[[3]] = best_stations(all_pollutants_station_2016data,1)
Lead_stations[[4]] = best_stations(all_pollutants_station_2017data,1)
Lead_stations[[5]] = best_stations(all_pollutants_station_2018data,1)
Lead_stations[[6]] = best_stations(all_pollutants_station_2019data,1)



CO_stations = list()

CO_stations[[1]] = best_stations(all_pollutants_station_2014data,2)
CO_stations[[2]] = best_stations(all_pollutants_station_2015data,2)
CO_stations[[3]] = best_stations(all_pollutants_station_2016data,2)
CO_stations[[4]] = best_stations(all_pollutants_station_2017data,2)
CO_stations[[5]] = best_stations(all_pollutants_station_2018data,2)
CO_stations[[6]] = best_stations(all_pollutants_station_2019data,2)



SO2_stations = list()

SO2_stations[[1]] = best_stations(all_pollutants_station_2014data,3)
SO2_stations[[2]] = best_stations(all_pollutants_station_2015data,3)
SO2_stations[[3]] = best_stations(all_pollutants_station_2016data,3)
SO2_stations[[4]] = best_stations(all_pollutants_station_2017data,3)
SO2_stations[[5]] = best_stations(all_pollutants_station_2018data,3)
SO2_stations[[6]] = best_stations(all_pollutants_station_2019data,3)



NO2_stations = list()

NO2_stations[[1]] = best_stations(all_pollutants_station_2014data,4)
NO2_stations[[2]] = best_stations(all_pollutants_station_2015data,4)
NO2_stations[[3]] = best_stations(all_pollutants_station_2016data,4)
NO2_stations[[4]] = best_stations(all_pollutants_station_2017data,4)
NO2_stations[[5]] = best_stations(all_pollutants_station_2018data,4)
NO2_stations[[6]] = best_stations(all_pollutants_station_2019data,4)



O3_stations = list()

O3_stations[[1]] = best_stations(all_pollutants_station_2014data,5)
O3_stations[[2]] = best_stations(all_pollutants_station_2015data,5)
O3_stations[[3]] = best_stations(all_pollutants_station_2016data,5)
O3_stations[[4]] = best_stations(all_pollutants_station_2017data,5)
O3_stations[[5]] = best_stations(all_pollutants_station_2018data,5)
O3_stations[[6]] = best_stations(all_pollutants_station_2019data,5)



PM10_stations = list()

PM10_stations[[1]] = best_stations(all_pollutants_station_2014data,6)
PM10_stations[[2]] = best_stations(all_pollutants_station_2015data,6)
PM10_stations[[3]] = best_stations(all_pollutants_station_2016data,6)
PM10_stations[[4]] = best_stations(all_pollutants_station_2017data,6)
PM10_stations[[5]] = best_stations(all_pollutants_station_2018data,6)
PM10_stations[[6]] = best_stations(all_pollutants_station_2019data,6)



# Lead.PM10_stations = list()
# 
# Lead.PM10_stations[[1]] = best_stations(all_pollutants_station_2014data,7)
# Lead.PM10_stations[[2]] = best_stations(all_pollutants_station_2015data,7)
# Lead.PM10_stations[[3]] = best_stations(all_pollutants_station_2016data,7)
# Lead.PM10_stations[[4]] = best_stations(all_pollutants_station_2017data,7)
# Lead.PM10_stations[[5]] = best_stations(all_pollutants_station_2018data,7)
# Lead.PM10_stations[[6]] = best_stations(all_pollutants_station_2019data,7)



PM2.5_stations = list()

PM2.5_stations[[1]] = best_stations(all_pollutants_station_2014data,7)
PM2.5_stations[[2]] = best_stations(all_pollutants_station_2015data,7)
PM2.5_stations[[3]] = best_stations(all_pollutants_station_2016data,7)
PM2.5_stations[[4]] = best_stations(all_pollutants_station_2017data,7)
PM2.5_stations[[5]] = best_stations(all_pollutants_station_2018data,7)
PM2.5_stations[[6]] = best_stations(all_pollutants_station_2019data,7)

Heatmap of population with station locations marked (2015)

Do the stations provide a good spatial coverage of California? To me, the coverage is reasonable especially because most of the northern and eastern counties are where most of the sparsely populated counties are located. There are probably not that many EPA stations there as a result.

pollutants1_2015 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants1_2015_8.17.RData")
pollutants2_2015 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2015_8.17.RData")

stationlats = c(unique(pollutants1_2015$latitude),unique(pollutants2_2015$latitude))
stationlongs = c(unique(pollutants1_2015$longitude),unique(pollutants2_2015$longitude))

station_points = data.frame(stationlats,stationlongs)

#Plot
gg_pop_stations <- ggplot() +
  geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = pop), 
               color = "black") +
  coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
  theme_void() +
  labs(title = "Heatmap of County Populations with Station Locations for 2015") +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

# Add points
gg_pop_stations <- gg_pop_stations +
  geom_point(data = station_points, aes(x = stationlongs, y = stationlats), 
             color = "red", size = 1.5)

print(gg_pop_stations)

Downloading and aggregating air quality data using direct API calls

Given a set of 5-20 monitoring stations for each county, we loop through its station codes (for each year 2014-2019) and query using the EPA’s AQS function. This function only allows you to query a maximum of 4 parameters at once for a single year, so two calls to the function have to be made for each year (4 and 3).

(BELOW SHOWS IT BEING DONE FOR 2019)

END GOAL FINAL FORM: ONE BIG DATAFRAME (ALL POLLUTANTS ALL YEARS TOGETHER, USE FILTER TO SEPARATE)

# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5

stations2019x = rbind(Lead_stations[[6]],CO_stations[[6]],
             SO2_stations[[6]],NO2_stations[[6]])
stations2019y = rbind(O3_stations[[6]],PM10_stations[[6]],PM2.5_stations[[6]])

sitenums2019x = stations2019x %>% select(county_code,site_number) %>% unique() #198 stations
sitenums2019y = stations2019y %>% select(county_code,site_number) %>% unique() #178 stations


#Trying EPA R Package query (took 15 + 20 min!) gives us a dataframe 
ccodes = sitenums2019y$county_code
snums = sitenums2019y$site_number
str1 = "2019-01-01"
str2 = "2019-12-31"

pollutants1_2019 = aqs_dailysummary_by_site(parameter = c("14129","42101","42401","42602"),bdate = as.Date(str1),edate = as.Date(str2),stateFIPS = "06",countycode = ccodes,sitenum = snums)

pollutants2_2019 = aqs_dailysummary_by_site(parameter = c("44201","81102","88101"),bdate = as.Date(str1),edate = as.Date(str2),stateFIPS = "06",countycode = ccodes,sitenum = snums)

###SAVE LIST LOCALLY
saveRDS(pollutants2_2019,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2019_8.18.RData")

After querying the raw data for each pollutant and each year, we want to go through it and remove any data from stations that have “bad” data. The standards that I set (for now) are that for a given year, a station should have at least 240 of the 365 days included for a year of data. Additionally, I applied a Hampel filter to identify outliers and if there are more than 14 consecutive outliers in the data i.e., days in a row with measurements that are abnormal, I considered the data from that station to not be suitable for inclusion in the final dataset.

NOTE: the daily values reported in the raw dataset are actually daily averages from periodic measurements made by the station throughout the day

After data from so called “bad” stations were removed, another function is applied which aggregates the daily observations into a monthly median The raw data is in the form of a single dataframe, which is fed into the important function raw_transform(). This function uses several functions to create a list of dataframes, one for each county, which represent the monthly median measurements for a certain pollutant in a given county in a given year.

QUALITY CHECK FUNCTION FOR STATION DATA: WANT TO ADDRESS OUTLIERS, MISSINGNESS

#Given a dataset like CO2016 (list of 1000ish stations), check for 2/3 missing data and for strings of outliers (14 in a row)

station_quality_check = function(station_data){
  l = length(station_data)
  badindex = c()
  consecutive_outliers = list()
  
  for (i in 1:l){
    aqi = station_data[[i]]$aqi
    pollutant_level = station_data[[i]]$arithmetic_mean
    
    #check for outliers in AQI
    median.aqi = median(na.omit(aqi))
    mad.aqi = mad(na.omit(aqi))
    
    min.aqi = median.aqi-(3*mad.aqi)
    max.aqi = median.aqi+(3*mad.aqi)
    
    outliers.aqi = which(aqi < min.aqi | aqi > max.aqi)
    
    result.aqi = rle(diff(outliers.aqi))

    
    #check for outliers in pollutant measure
    median.pollutant = median(na.omit(pollutant_level))
    mad.pollutant = mad(na.omit(pollutant_level))
    
    min.pollutant = median.pollutant-(3*mad.pollutant)
    max.pollutant = median.pollutant+(3*mad.pollutant)
    
    outliers.pollutant = which(pollutant_level < min.pollutant | pollutant_level > max.pollutant)
    
    result.pollutants = rle(diff(outliers.pollutant))
    
    
    if (nrow(station_data[[i]]) < 240){
      badindex = c(badindex,i)
    }
    
    else if (any(result.aqi$lengths >= 14 & result.aqi$values == 1) == TRUE){
      badindex = c(badindex,i)
    }
    
    else if (any(result.pollutants$lengths >= 14 & result.pollutants$values == 1) == TRUE){
      badindex = c(badindex,i)
    }
    
    consecutive_outliers[[i]] = c("AQI",outliers.aqi,"POLLUTANTS",outliers.pollutant)
  }
  
  bad_list = list(badindex,consecutive_outliers)
  
  return(bad_list)
}


#Test on CO2016 and CO2017
# station_quality_check(CO2016) #returns 61 "bad stations" out of 1230
# 
# removeidx = station_quality_check(SO22017)[[1]] #returns 715 out of 1056 "bad stations" 
# 
# test = SO22017[- removeidx]

FUNCTION THAT AGGREGATES DAILY DATA INTO MONTHYLY MEDIANS

monthly_agg = function(pollutantdata){
  #Aggregating all the station data at once
  date = ymd(pollutantdata$date_local)
  df2 <- pollutantdata                                   # Duplicate data
  df2$year_month <- floor_date(date,"month")  # Create year-month column
  df3 = df2 %>% select(county,site_number,arithmetic_mean,aqi,year_month) %>% as.data.frame()
  
  df3$arithmetic_mean = as.numeric(df3$arithmetic_mean)
  df3$aqi[which(df3$aqi == "NULL")] = NA
  df3$aqi = as.numeric(df3$aqi)
  
  df.agg = df3 %>% group_by(year_month) %>% dplyr::summarize(arithmetic_mean = median(na.omit(arithmetic_mean)),aqi = median(na.omit(aqi))) %>% as.data.frame()
  
  return(df.agg)
}

IMPORTANT FUNCTION: TRANSFORMING RAW DATA TO FINAL FORM

# Group 1: 14129 - Lead, 421012 - Carbon monoxide (CO), 42401 - Sulfure dioxide (SO2), 42602 - Nitrogen dioxide (NO2)
# Group 2: 44201 - Ozone (O3), 81102 - Total PM10, 88101 - PM2.5

raw_transform = function(rawdata,reference_list,standard){
  
  ###SEPARATE DF INTO A LIST OF DFs 
  
  matched_list = list()
  
  if(missing(standard)){
    for (i in 1:nrow(reference_list)){
    data = rawdata %>% filter(county_code == reference_list$county_code[i], site_number == reference_list$site_number[i])
  
    matched_list[[i]] = data
    }
  } else {
      for (i in 1:nrow(reference_list)){
      data = rawdata %>% filter(county_code == reference_list$county_code[i], site_number == reference_list$site_number[i],pollutant_standard == standard)

      matched_list[[i]] = data
    }
  }
  
  names(matched_list) = reference_list$measuring_county
  
  ###STATION QUALITY CHECK
  
  removeidx = station_quality_check(matched_list)[[1]]
  good_matched_list = matched_list[- removeidx]
  
  #Convert list back into one big dataframe
  temp = as.data.frame(do.call(rbind, good_matched_list)) #TOO MANY ROWS RIGHT?
  good_df = unique.data.frame(temp)
  
  
  ###MAKE A LIST OF COMBINED STATION DATA FOR EACH COUNTY
  mid_list = list()

  for (i in unique(reference_list$measuring_county)){
    
    df_new = data.frame(good_df[1,])
    subset = reference_list %>% filter(measuring_county == i) %>% select(county_code,site_number)
  
    for (j in 1:nrow(subset)){
      pull = good_df %>% filter(county_code == reference_list$county_code[j], site_number == reference_list$site_number[j])
      
      df_new = rbind(df_new,pull)
    }
    
    df_new = df_new[-1,]
    mid_list[[i]] = df_new
  }
  
  ###AGGREGATE DAILY DATA TO MONTHLY FOR EACH COUNTY
  
  final_list = lapply(mid_list,monthly_agg)
  
  return(final_list)
}

When assembling final datasets, note that certain pollutant standards are used bc they have values for AQI… the ones I used were:

Lead: Lead 3-Month 2009 ?? Has all NAs for AQI

CO: CO 8-hour 1971

SO2: SO2 1-hour 2010

NO2: NO2 1-hour 2010

O3: Ozone 8-hour 2015 ; sample duration should be 8 HR

PM10: PM10 24-hour 2006

PM2.5: PM25 24-hour 2012

APPLY RAW TRANSFORM FUNCTION TO ALL POLLUTANTS FOR ALL YEARS (BELOW SHOWS IT BEING DONE FOR 2019)

#Load raw data
pollutants1_2019 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants1_2019_8.18.RData")
pollutants2_2019 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2019_8.18.RData")

Lead2019 = pollutants1_2019 %>% filter(parameter_code == "14129")
CO2019 = pollutants1_2019 %>% filter(parameter_code == "42101")
SO22019 = pollutants1_2019 %>% filter(parameter_code == "42401")
NO22019 = pollutants1_2019 %>% filter(parameter_code == "42602")
O32019 = pollutants2_2019 %>% filter(parameter_code == "44201")
PM102019 = pollutants2_2019 %>% filter(parameter_code == "81102")
PM2.52019 = pollutants2_2019 %>% filter(parameter_code == "88101")


Lead2019_final = raw_transform(rawdata = Lead2019,reference_list = Lead_stations[[6]],standard = "Lead 3-Month 2009")

CO2019_final = raw_transform(rawdata = CO2019,reference_list = CO_stations[[6]],standard = "CO 8-hour 1971")

SO22019_final = raw_transform(rawdata = SO22019,reference_list = SO2_stations[[6]],standard = "SO2 1-hour 2010")

NO22019_final = raw_transform(rawdata = NO22019,reference_list = NO2_stations[[6]],standard = "NO2 1-hour 2010")

O32019_final = raw_transform(rawdata = O32019,reference_list = O3_stations[[6]],standard = "Ozone 8-hour 2015")

PM102019_final = raw_transform(rawdata = PM102019,reference_list = PM10_stations[[6]],standard = "PM10 24-hour 2006") 

PM2.52019_final = raw_transform(rawdata = PM2.52019,reference_list = PM2.5_stations[[6]],standard = "PM25 24-hour 2012")

COMBINING EACH POLLUTANTS DATASET INTO A SINGLE DATAFRAME FOR THE YEAR (BELOW SHOWS IT BEING DONE FOR 2019)

###Combine final data into one dataframe for 2014 
test1 = as.data.frame(do.call(rbind, Lead2019_final))
test1 = cbind(test1,rep(pollutants$parametercodes.code[1],nrow(test1))) #maybe change parameter codes to 1-7?
colnames(test1) = c("Year-Month","Value","AQI","Pollutant")

test2 = as.data.frame(do.call(rbind, CO2019_final))
test2 = cbind(test2,rep(pollutants$parametercodes.code[2],nrow(test2))) #maybe change parameter codes to 1-7?
colnames(test2) = c("Year-Month","Value","AQI","Pollutant")

test3 = as.data.frame(do.call(rbind, SO22019_final))
test3 = cbind(test3,rep(pollutants$parametercodes.code[3],nrow(test3))) #maybe change parameter codes to 1-7?
colnames(test3) = c("Year-Month","Value","AQI","Pollutant")

test4 = as.data.frame(do.call(rbind, NO22019_final))
test4 = cbind(test4,rep(pollutants$parametercodes.code[4],nrow(test4))) #maybe change parameter codes to 1-7?
colnames(test4) = c("Year-Month","Value","AQI","Pollutant")

test5 = as.data.frame(do.call(rbind, O32019_final))
test5 = cbind(test5,rep(pollutants$parametercodes.code[5],nrow(test5))) #maybe change parameter codes to 1-7?
colnames(test5) = c("Year-Month","Value","AQI","Pollutant")

test6 = as.data.frame(do.call(rbind, PM102019_final))
test6 = cbind(test6,rep(pollutants$parametercodes.code[6],nrow(test6))) #maybe change parameter codes to 1-7?
colnames(test6) = c("Year-Month","Value","AQI","Pollutant")

test7 = as.data.frame(do.call(rbind, PM2.52019_final))
test7 = cbind(test7,rep(pollutants$parametercodes.code[7],nrow(test7))) #maybe change parameter codes to 1-7?
colnames(test7) = c("Year-Month","Value","AQI","Pollutant")

#Combine each pollutant dataset into one big dataset for the year
final_data19 = rbind(test1,test2,test3,test4,test5,test6,test7)

###SAVE FINAL DATASET LOCALLY
saveRDS(final_data19,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")

HAVE TO CLEAN DATA BEFORE FINALIZING:

Having combined the monthly medians for every county for a single year for each pollutant into a single dataframe, one final cleaning step must be performed before putting each years’ data together. The AQI value that is in each row corresponds to the AQI standardized measurement for that specific pollutant. Each pollutant has a different standardizing equation, but once they are all standardized as they are in the dataset and they can be compared against each other. The reported AQI measurement for a given day is just the maximum of the AQI values corresponding to each of the 7 pollutants. So the maximum AQI value (among 6 values bc Lead observations never have AQI values) was found for each month and that value was set as the actual AQI value for that month in all corresponding rows.

FIND MAX AQI (AMONG THE 7 POLLUTANTS) FOR EACH MONTH -> SET AS ACTUAL AQI FOR THAT MONTH

(BELOW SHOWS IT BEING DONE FOR 2019)

#Do for each year 
months = c("01","02","03","04","05","06","07","08","09","10","11","12")

###Do for each year 
for (i in 1:58){
  idx1 = which(stringr::str_starts(rownames(final_data19), counties[i]))
  subset1 = final_data19[idx1,]
  subset1$`Year-Month`= as.Date(subset1$`Year-Month`)
  
  for (j in months){ 
  #Filter by county and date
    date = paste0("2019-",j,"-01")
    date = as.Date(date)
    subset2 = subset1 %>% filter(`Year-Month` == as.Date(date)) 
    
    trueAQI = max(na.omit(subset2$AQI))
    
    idx2 = which(subset1$`Year-Month` == date)
    subset1$AQI[idx2] = trueAQI
  }
  
  final_data19[idx1,] = subset1
}

###SAVE FINAL DATASET LOCALLY
saveRDS(final_data19,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")

COMBINING EACH YEARS DATASET INTO ONE BIG TIDY DATAFRAME FOR AIR QUALITY COVARIATES

FILL IN YOUR OWN FILE DIRECTORIES HERE! START WORKING W ACTUAL EPA DATA FROM HERE ON

final_data14 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data14_9.1.RData")
final_data15 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data15_9.1.RData")
final_data16 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data16_9.1.RData")
final_data17 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data17_9.1.RData")
final_data18 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data18_9.1.RData")
final_data19 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")

final_EPA_data = rbind(final_data14,final_data15,final_data16,final_data17,
                       final_data18,final_data19)
saveRDS(final_EPA_data,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_EPA_data_9.1.RData")


final_EPA_data = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_EPA_data_9.1.RData")
head(final_EPA_data)
##           Year-Month  Value AQI Pollutant
## Alameda.1 2014-01-01 0.0500  60     14129
## Alameda.2 2014-02-01 0.0520  28     14129
## Alameda.3 2014-03-01 0.0655  34     14129
## Alameda.4 2014-04-01 0.0360  40     14129
## Alameda.5 2014-05-01 0.0300  38     14129
## Alameda.6 2014-06-01 0.0230  37     14129

Adding cluster labels to the EPA data and aggregating based on clusters

Once the EPA data is in the correct format (one big dataframe) at the county level, we now need to aggregate it to the cluster level according to the SKATER cluster labels from before. A population weighted mean was used to aggregate here as well, MAY OR MAY NOT BE APPROPRIATE?

Cluster = rep(1,length(final_EPA_data))
final_EPA_agg_data = cbind(final_EPA_data,Cluster)

for (i in 1:58){
  idx = which(stringr::str_starts(rownames(final_EPA_agg_data), counties[i]))
  final_EPA_agg_data$Cluster[idx] = clusterlabels$Cluster[i]
}

Time = c(rep(c(1:12),58),rep(13:24,58),rep(25:36,58),rep(37:48,58),rep(49:60,58),rep(61:72,58)) 
Time = rep(Time,7)
final_EPA_agg_data = cbind(Time,final_EPA_agg_data)

AGGREGATE CLUSTER DATA AND COMBINE INTO ONE DATAFRAME

countypops = CA_data %>% filter(Year > 2013) %>% select(Total_Pop,County,Year) %>% unique()
countypops = cbind(countypops,Cluster = rep(clusterlabels$Cluster,each=6))
countypops$County = rep(counties,each=6)

temp_EPA_agg_data = data.frame(final_EPA_agg_data[1,-2])
num_clus = length(unique(clusterlabels$Cluster))

for (k in 1:num_clus){
  
  EPA_clus_k = data.frame(final_EPA_agg_data[1,-2])
  
  for (i in pollutants$parametercodes.code){
  pollutant_data = final_EPA_agg_data %>% filter(Pollutant == i)

  cluster_data = pollutant_data %>% filter(Cluster == k)
  cluster_data$Value = scale(cluster_data$Value)
  cluster_data$AQI = scale(cluster_data$AQI)
  year = 2014
  
  for(j in 1:72){
    cluster_data_j = cluster_data %>% filter(Time == j)
    cluster_counties = countypops %>% filter(Cluster == k,Year == year)
    
    pops = countypops %>% filter(Year == year,Cluster == k) %>% select(Total_Pop) 
    cluster.pop = sum(pops)
    cluster.popweights = pops/cluster.pop
    
    value_wmean = weighted.mean(cluster_data_j$Value,cluster.popweights$Total_Pop)
    aqi_wmean = weighted.mean(cluster_data_j$AQI,cluster.popweights$Total_Pop)
    insert = data.frame(Time = j,value_wmean,aqi_wmean,
                        Pollutant = i,Cluster = k)
    colnames(insert) = colnames(EPA_clus_k)
    
    EPA_clus_k = rbind(EPA_clus_k,insert)
    
    if ((j>12) & (j<25)){
      year = 2015
    }
    
    else if ((j>24) & (j<37)){
      year = 2016
    }
    
    else if ((j>36) & (j<49)){
      year = 2017
    }
    
    else if ((j>48) & (j<61)){
      year = 2018
    }
    
    else if ((j>60) & (j<73)){
      year = 2019
    }
    
    else{
      year = 2014
    }
  }
  }
  
  EPA_clus_k = EPA_clus_k[-1,]
  trueAQI = EPA_clus_k$AQI[1:72]
  trueAQI = rep(trueAQI,7)
  EPA_clus_k$AQI = trueAQI
  rownames(EPA_clus_k) = NULL
  
  temp_EPA_agg_data = rbind(temp_EPA_agg_data,EPA_clus_k)
  
}

final_EPA_agg_data = temp_EPA_agg_data[-1,]

saveRDS(final_EPA_agg_data,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_8.1_EPA_agg_data_10.26.RData")

final_EPA_agg_data = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_8.1_EPA_agg_data_10.26.RData")

head(final_EPA_agg_data)
##   Time     Value        AQI Pollutant Cluster
## 1    1 3.4872956  2.5383398     14129       1
## 2    2 3.7080056 -1.3061830     14129       1
## 3    3 5.1977985 -0.5285298     14129       1
## 4    4 1.9423252  0.1260502     14129       1
## 5    5 1.2801950 -0.1594367     14129       1
## 6    6 0.5077098 -0.4152849     14129       1

Plot time series of each EPA variable

library(astsa)
## Warning: package 'astsa' was built under R version 4.2.2
## 
## Attaching package: 'astsa'
## The following object is masked from 'package:maps':
## 
##     unemp
for (i in pollutants$parametercodes.code){
  for (j in 1:num_clus){
    EPA_clus_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == j)

    title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
    tsplot(EPA_clus_ts$Value,main = title,xlab = "Time (months)",ylab = "Value")  
  }
}

It turns out that the time series for Lead was not stationary, so we need to detrend the data before using it

for (i in pollutants$parametercodes.code){
  for (j in 1:num_clus){
    EPA_clus_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == j)
    lead_idx = which(final_EPA_agg_data$Pollutant == "14129" & final_EPA_agg_data$Cluster == j)
    
    if (i == "14129"){
      detrended1 = c(0,diff(EPA_clus_ts$Value,lag=1))
      final_EPA_agg_data[lead_idx,2] = detrended1
      
      title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
      tsplot(detrended1,main = title,xlab = "Time (months)",ylab = expression(nabla~Value))
    } else{
      title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
      tsplot(EPA_clus_ts$Value,main = title,xlab = "Time (months)",ylab = "Value")  
    }
  }
}

for (k in 1:num_clus){
  AQI_clus_ts = final_EPA_agg_data %>% filter(Pollutant == "14129") %>% filter(Cluster == k)
  title = sprintf("AQI - Cluster %1.0f",k)
  tsplot(AQI_clus_ts$AQI,main = title,xlab = "Time (months)",ylab = "Value")
}

Building covariate matrices to be used for gram matrix calculations

EPA_cluster_list = list()

cluster_EPA_data = function(cluster){
  EPA_clus_ts = matrix(nrow=72)

  for (i in pollutants$parametercodes.code){
    covariate_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == cluster)
    EPA_clus_ts = cbind(EPA_clus_ts,covariate_ts$Value)
  }
  EPA_clus_ts[,1] = covariate_ts$AQI
  colnames(EPA_clus_ts) = c("AQI","Lead","CO","SO2","NO2","O3","PM10","PM2.5")
  
  return(EPA_clus_ts)
}

for (i in 1:num_clus){
  EPA_cluster_list[[i]] = cluster_EPA_data(i)
}
decompose_ts = function(EPA_ts){
  S_scaled = EPA_ts 
  colnames(S_scaled) = c("AQI","Lead","CO","SO2","NO2","O3","PM10","PM2.5")
  S_scaled_ts = ts(S_scaled, frequency = 12)
  
  # S_decomposed = decompose(S_scaled_ts)
  
  ts_decomposed <- lapply(colnames(S_scaled_ts), function(x) {stl(S_scaled_ts[, x], s.window = "periodic")})
  names(ts_decomposed) = colnames(S_scaled_ts)
  
  S_seasonal = ts_decomposed[[1]]$time.series[,1]
  S_trend = ts_decomposed[[1]]$time.series[,2]
  S_random = ts_decomposed[[1]]$time.series[,3]
  
  for (i in 2:8){
    S_seasonal = cbind(S_seasonal,ts_decomposed[[i]]$time.series[,1])
    S_trend = cbind(S_trend,ts_decomposed[[i]]$time.series[,2])
    S_random = cbind(S_random,ts_decomposed[[i]]$time.series[,3])
  }
  
  colnames(S_trend)= colnames(S_scaled_ts)
  colnames(S_seasonal)= colnames(S_scaled_ts)
  colnames(S_random)= colnames(S_scaled_ts)
  
  S_DL = S_seasonal + S_random
  colnames(S_DL)= colnames(S_scaled_ts)
  
  S_random_int = S_random[12:72,]
  S_random = S_random[13:72,]
  S_seasonal = S_seasonal[13:72,]
  S_trend = S_trend[13:72,]
  S_DL = S_DL[13:72,]
  
  S_DL = data.frame(S_DL)
  S_DL2 = matrix(nrow=60)
  dl = c(3,6,12)
  col_num = 2
  for (i in dl){
    for (j in 1:ncol(S_DL)){
      extract = S_DL[(72-59-i):(72-i),j]
      S_DL2 = cbind(S_DL2,extract)
      colnames(S_DL2)[col_num] = sprintf("B%1.0f-%s",i,colnames(S_DL)[j])
      col_num = col_num+1
    }
  }
  
  S_DL2 = S_DL2[,-1]
  S_DL_final = cbind(S_DL[13:72,],S_DL2)
  
  W = matrix(nrow=(nrow(S_random))^2)
  num_cols = ncol(S_random)
  # num_cols = ncol(S_DL_final) #for now, just calculate interaction pairs for actual covariates
  col_num = 2
  
  for (i in 1:num_cols){
    for (j in 1:num_cols){
      interaction_col = kronecker(S_random[,i],S_random[,j]) #replace S_random with S_DL_final for DL interactions 
      W = cbind(W,interaction_col)
      
      colnames(W)[col_num] = sprintf("%sx%s",colnames(S_scaled)[i],colnames(S_scaled)[j])
      col_num = col_num+1
    }
  }
  W = W[,-1]
  
  row1 = c()
  
  for (k in 1:ncol(S_random_int)){
    row1 = c(row1,S_random_int[2,k]*S_random_int[1,])
  }
  
  W2 = rbind(as.numeric(row1),W)
  W2 = W2[,-seq(1,64,by=9)] #need to change if we include DL interactions
  
  list = list(S_random,S_random_int,S_seasonal,S_DL,
              S_DL2,S_DL_final,S_trend,W2)
  names(list) = c("S_random","S_random_int","S_seasonal","S_DL",
              "S_DL2","S_DL_final","S_trend","W2")
  return(list)
}

decompose_clus1 = decompose_ts(EPA_cluster_list[[1]])
decompose_clus2 = decompose_ts(EPA_cluster_list[[2]])
decompose_clus3 = decompose_ts(EPA_cluster_list[[3]])
decompose_clus4 = decompose_ts(EPA_cluster_list[[4]])
decompose_clus5 = decompose_ts(EPA_cluster_list[[5]])
decompose_clus6 = decompose_ts(EPA_cluster_list[[6]])
decompose_clus7 = decompose_ts(EPA_cluster_list[[7]])

decomposed_cluster_data = list(decompose_clus1,decompose_clus2,decompose_clus3,
                            decompose_clus4,decompose_clus5,decompose_clus6,
                            decompose_clus7)

S_random_all = cbind(decompose_clus1$S_random,decompose_clus2$S_random,
                     decompose_clus3$S_random,decompose_clus4$S_random,
                     decompose_clus5$S_random,decompose_clus6$S_random,
                     decompose_clus7$S_random)

S_DL_all = cbind(cbind(decompose_clus1$S_DL,decompose_clus2$S_DL,
                     decompose_clus3$S_DL,decompose_clus4$S_DL,
                     decompose_clus5$S_DL,decompose_clus6$S_DL,
                     decompose_clus7$S_DL))

W2_all = cbind(cbind(decompose_clus1$W2,decompose_clus2$W2,
                     decompose_clus3$W2,decompose_clus4$W2,
                     decompose_clus5$W2,decompose_clus6$W2,
                     decompose_clus7$W2))

Power spectral density analysis to identify weights of significant lags :

library(psd)

S_psd = pspectrum(decompose_clus1$S_DL,plot=TRUE)
freq = 10^(S_psd$freq)
plot(S_psd$spec[,8]~S_psd$freq,xlab="frequency",ylab="spectral density",type = "l")



top5_peaks = sort(abs(S_psd$spec),decreasing = TRUE)[1:5]
top5_freqs = S_psd$freq[which(abs(S_psd$spec) == 0.4173804)]

spec_decomp = spectrum(S_seasonal)
# spec_decomp$spec
test.spec = spectrum(decompose_clus1$S_DL,span=3,log="no",plot=TRUE)
plot(test.spec$spec[,8]~test.spec$freq,xlab="frequency",ylab="spectral density",type = "l")
dom.freq=test.spec$freq[which.max(test.spec$spec[,8])]

Calculating autoregressive structure (linear time invariant approach)

Let \(\sigma^2_{AR} = \gamma(0)\) be the variance of one of our time series, we can find ACVF and ACF from Yule-Walker as

\(\gamma(k) = a_1 \gamma(k-1) + a_2 \gamma(k-2) + ... + a_p \gamma(k-p)\)

\(\rho(k) = \frac{\gamma(k)}{\gamma(0)} = a_1 \rho(k-1) + a_2 \rho(k-2) + ... + a_p \rho(k-p)\)

AR_invariant_list = list()

for (c in 1:num_clus){
  #Grab S_random data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_random_clus = cluster_data$S_random
  
  ar.corr.values = c()
  ar.cov.values = c()
  
  for (i in 1:ncol(S_random_clus)){
    var = var(S_random_clus[,i])
    fit.ar = ar(S_random_clus[,i],order.max = 1, aic = FALSE, method = "yule-walker")
    
    corr.ar1 = fit.ar$ar
    cov.ar1 = fit.ar$ar * var
    
    ar.corr.values = c(ar.corr.values,corr.ar1)
    ar.cov.values = c(ar.cov.values,cov.ar1)
  }
  
  for (j in 1:ncol(S_random_clus)){
    AR_invariant_covmatrix = diag(nrow(S_random_clus))
    
    AR_invariant_covmatrix[row(AR_invariant_covmatrix) == col(AR_invariant_covmatrix) - 1] = ar.cov.values[j]
    AR_invariant_covmatrix[row(AR_invariant_covmatrix) == col(AR_invariant_covmatrix) + 1] = ar.cov.values[j]
  }
  
  AR_invariant = diag(nrow(S_random_clus))
  AR_invariant[row(AR_invariant) == col(AR_invariant) - 1] = (1/length(ar.cov.values))*sum(ar.cov.values)
  AR_invariant[row(AR_invariant) == col(AR_invariant) + 1] = (1/length(ar.cov.values))*sum(ar.cov.values)
  
  # corrplot(AR_invariant, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  #          title = "Invariant AR 1 Covariance Structure")
  
  matrix_heatmap(AR_invariant,title = "Invariant AR 1 Covariance Structure")

  
  AR_invariant_list[[c]] = AR_invariant
}

K_AR_invariant = matrix(0,nrow=60,ncol=60)

for(i in 1:num_clus){
  K_AR_invariant = K_AR_invariant + ((1/num_clus)*AR_invariant_list[[i]])
}

# corrplot(K_AR_invariant, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
#            title = "Invariant AR 1 Covariance Structure")

matrix_heatmap(K_AR_invariant,title = "Invariant AR 1 Covariance Structure")

K_AR_cluster = list()
K_AR_periodic_cluster = list()

for (c in 1:num_clus){
  
  #Grab S_random data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_random_clus = cluster_data$S_random
  
  #Create a list to contain covariance matrix for each pollutant (8)
  K_AR_list = list()
  K_AR_periodic_list = list()
  
  rho_AR = 1
  sigma2_AR = 1
  
  time_span = nrow(S_random_clus)
  
  #Calculate a AR 1 covariance matrix for each pollutant and store in list
  for (i in 1:8){
    ts =  S_random_clus[,i]
  
    K_covariate = matrix(nrow=time_span,ncol=time_span)
    K_covariate_periodic = matrix(nrow=time_span,ncol=time_span)
    
    for(j in 1:time_span){
      for (k in 1:time_span){
        if (abs(j-k) <= 1){
          
          K_covariate[j,k] = exp(- ((ts[j] - ts[k])^2) #RBF kernel 
                               / (2*rho_AR)) * sigma2_AR
          
          K_covariate_periodic[j,k] = exp(- ((ts[j] - ts[k])^2) #Locally periodic kernel 
                       / (2*rho_AR)) * exp(- (2*sin((abs(ts[j] - ts[k]))*pi/12)^2)
                       / (rho_AR)) * sigma2_AR
        }
        else{
          K_covariate_periodic[j,k] = 0
          K_covariate[j,k] = 0
          }
      }
    }
    
    K_AR_list[[i]] = K_covariate
    K_AR_periodic_list[[i]] = K_covariate_periodic
  }
  
  names(K_AR_list) = colnames(S_random_clus)
  names(K_AR_periodic_list) = colnames(S_random_clus)
  
  #Add each pollutant's covariance matrix to get AR 1 matrix for each cluster
  K_AR = matrix(0,nrow=60,ncol=60)
  K_AR_periodic = matrix(0,nrow=60,ncol=60)
  
  K_AR_periodic_weights = rep(1,length(K_AR_periodic_list))
  
  for(i in 1:length(K_AR_periodic_list)){
    K_AR = K_AR + ((1/8)*K_AR_list[[i]])
    K_AR_periodic = K_AR_periodic + ((1/8)*K_AR_periodic_list[[i]])
    
    K_AR_periodic_weights[i] = norm(K_AR_periodic_list[[i]],type = "F")
  }
  
  K_AR_periodic_weights = K_AR_periodic_weights / sum(K_AR_periodic_weights)
  print(K_AR_periodic_weights)
  
  K_AR_cluster[[c]] = K_AR
  K_AR_periodic_cluster[[c]] = K_AR_periodic
}
## [1] 0.1180238 0.1257031 0.1279176 0.1208326 0.1290610 0.1318444 0.1254549
## [8] 0.1211627
## [1] 0.1183861 0.1257906 0.1279683 0.1210789 0.1290587 0.1317483 0.1247048
## [8] 0.1212643
## [1] 0.1177953 0.1260674 0.1281143 0.1206671 0.1292772 0.1321721 0.1251538
## [8] 0.1207528
## [1] 0.1179131 0.1258756 0.1279283 0.1207317 0.1295156 0.1320599 0.1251074
## [8] 0.1208684
## [1] 0.1178528 0.1258298 0.1278816 0.1206277 0.1293356 0.1320934 0.1257028
## [8] 0.1206765
## [1] 0.1182129 0.1258859 0.1279207 0.1202214 0.1291226 0.1320892 0.1254036
## [8] 0.1211437
## [1] 0.1183055 0.1254444 0.1268239 0.1206665 0.1289935 0.1315729 0.1272763
## [8] 0.1209169
for (i in 1:num_clus){
  title1 = sprintf("AR 1 Covariance for Cluster %s",i)
  title2 = sprintf("Periodic AR 1 Covariance for Cluster %s",i)
  
  # corrplot(K_AR_cluster[[i]], order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  #        title = title1)
  # corrplot(K_AR_periodic_cluster[[i]], order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  #        title = title2)
  
  matrix_heatmap(K_AR_cluster[[i]],title = title1)
  matrix_heatmap(K_AR_periodic_cluster[[i]],title = title2)
}

Combine each cluster’s AR 1 kernel together:

K_AR = matrix(0,nrow=60,ncol=60)
K_AR_periodic = matrix(0,nrow=60,ncol=60)

K_AR_periodic_weights = rep(1,num_clus)

for(i in 1:num_clus){
  K_AR = K_AR + ((1/num_clus)*K_AR_cluster[[i]])
  K_AR_periodic = K_AR_periodic + ((1/num_clus)*K_AR_periodic_cluster[[i]])
  
  K_AR_periodic_weights[i] = norm(K_AR_cluster[[i]],type = "F")
}

K_AR_periodic_weights = K_AR_periodic_weights / sum(K_AR_periodic_weights)

print(K_AR_periodic_weights)
## [1] 0.1429864 0.1428429 0.1425446 0.1426904 0.1428154 0.1428045 0.1433158
#Heatmap of resulting K 
# corrplot(K_AR, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
#        title = "AR 1 Covariance Structure")
# corrplot(K_AR_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
#        title = "Periodic AR 1 Covariance Structure")

matrix_heatmap(K_AR,title = "AR 1 Covariance Structure")

matrix_heatmap(K_AR_periodic,title = "Periodic AR 1 Covariance Structure")

Calculating distributed lag structure

DL_invariant_list = list()

for (c in 1:num_clus){
  
  #Grab S_DL data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_DL_clus = cluster_data$S_DL
  
  dl3.corr.values = c()
  dl3.cov.values = c()
  
  dl6.corr.values = c()
  dl6.cov.values = c()
  
  dl12.corr.values = c()
  dl12.cov.values = c()
  
  for (i in 1:ncol(S_DL_clus)){
    var = var(S_DL_clus[,i])
    fit.ar = ar(S_DL_clus[,i],order.max = 1, aic = FALSE, method = "yule-walker")
    
    #Fit a parametric AR model for each lag 
    dl3 = arima(S_DL_clus[,i],order = c(3,0,0),seasonal = c(0,0,0),include.mean = FALSE,fixed = c(0,0,NA))
    
    dl6 = arima(S_DL_clus[,i],order = c(6,0,0),seasonal = c(0,0,0),include.mean = FALSE,fixed = c(0,0,0,0,0,NA))
    
    dl12 = arima(S_DL_clus[,i],order = c(12,0,0),seasonal = c(0,0,0),include.mean = FALSE,
                 fixed = c(0,0,0,0,0,0,0,0,0,0,0,NA))
    
    #Calculate correlations and covariances from coefficient estimates
    corr.dl3 = as.numeric(dl3$coef[3])
    cov.dl3 = as.numeric(dl3$coef[3]) * var
    
    corr.dl6 = as.numeric(dl6$coef[6])
    cov.dl6 = as.numeric(dl6$coef[6]) * var
    
    corr.dl12 = as.numeric(dl12$coef[12])
    cov.dl12 = as.numeric(dl12$coef[12]) * var
    
    
    dl3.corr.values = c(dl3.corr.values,corr.dl3)
    dl3.cov.values = c(dl3.cov.values,cov.dl3)
    
    dl6.corr.values = c(dl6.corr.values,corr.dl6)
    dl6.cov.values = c(dl6.cov.values,cov.dl6)
    
    dl12.corr.values = c(dl12.corr.values,corr.dl12)
    dl12.cov.values = c(dl12.cov.values,cov.dl12)
  }
  
  # Run if you want to create a DL_invariant matrix for each pollutant 
  # for (j in 1:ncol(S_DL_clus)){
  #   DL_invariant_covmatrix = diag(nrow(S_DL_clus))
  #   
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 3] = dl3.cov.values[j]
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 3] = dl3.cov.values[j]
  #   
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 6] = dl6.cov.values[j]
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 6] = dl6.cov.values[j]
  #   
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 12] = dl12.cov.values[j]
  #   DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 12] = dl12.cov.values[j]
  #   
  #   # title = sprintf("Covariance of %s",colnames(S_DL_all)[j])
  #   # corrplot(DL_invariant_covmatrix, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  #   #      title = title)
  # }
  
  DL_invariant_covmatrix = diag(nrow(S_DL_clus))
  
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 3] = sum(dl3.cov.values)*(1/8)
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 3] = sum(dl3.cov.values)*(1/8)
  
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 6] = sum(dl6.cov.values)*(1/8)
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 6] = sum(dl6.cov.values)*(1/8)
  
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 12] = sum(dl12.cov.values)*(1/8)
  DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 12] = sum(dl12.cov.values)*(1/8)
  
  DL_invariant_list[[c]] = DL_invariant_covmatrix
  
  # corrplot(DL_invariant_covmatrix, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  #   title = "Invariant DL (3,6,12) Covariance Structure")
  
  matrix_heatmap(DL_invariant_covmatrix,title = "Invariant DL (3,6,12) Covariance Structure")
}

#Combine DL covariance matrices from each cluster together
K_DL_invariant = matrix(0,nrow=60,ncol=60)

for(i in 1:num_clus){
  K_DL_invariant = K_DL_invariant + ((1/num_clus)*DL_invariant_list[[i]])
}

# corrplot(K_DL_invariant, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
#   title = "Invariant DL (3,6,12) Covariance Structure")

matrix_heatmap(K_DL_invariant,title = "Invariant DL (3,6,12) Covariance Structure")

K_DL_cluster = list()
K_DL_periodic_cluster = list()

for (c in 1:num_clus){
  
  #Grab S_DL data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  S_DL_clus = cluster_data$S_DL
  
  #Create a list to store covariance matrix for each DL 
  K_DL_list = list()
  K_DL_periodic_list = list()
  
  dl_lags = c(3,6,12)
  tracker = 1
  
  for (i in dl_lags){
    
    K_DL = matrix(nrow=time_span,ncol=time_span)
    K_DL_periodic = matrix(nrow=time_span,ncol=time_span)
    
    rho_DL = 1
    sigma2_DL = 1
    
    #Calculate DL covariance matrix for specified lag   
    for(j in 1:nrow(S_DL_clus)){
      for (k in 1:nrow(S_DL_clus)){
        
        if ((abs(j-k) == 0) || (abs(j-k) == i)){
          
          K_DL[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2) / (2*rho_DL)) * sigma2_DL
          
          K_DL_periodic[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2)
                               / (2*rho_DL)) * exp(- (2*sin(sum(abs(S_DL_clus[j,] - S_DL_clus[k,]))*pi/12)^2)
                               / (rho_DL)) * sigma2_DL
          
        } 
        else{
          K_DL_periodic[j,k] = 0
          K_DL[j,k] = 0
          }
      }
    }
    
    K_DL_list[[tracker]] = K_DL
    K_DL_periodic_list[[tracker]] = K_DL_periodic
    tracker = tracker+1
  }
  
  #Combine the 3 DL covariance matrices together
  K_DL = matrix(0,nrow=time_span,ncol=time_span)
  K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
  
  K_DL_periodic_weights = rep(1,length(K_DL_periodic_list))
  
  for(i in 1:length(K_DL_periodic_list)){
    K_DL = K_DL + ((1/3)*K_DL_list[[i]])
    K_DL_periodic = K_DL_periodic + ((1/3)*K_DL_periodic_list[[i]])
    
    K_DL_periodic_weights[i] = norm(K_DL_periodic_list[[i]],type = "F")
  }
  
  K_DL_periodic_weights = K_DL_periodic_weights / sum(K_DL_periodic_weights)
  print(K_DL_periodic_weights)
  
  #Store DL(3,6,12) covariance matrix for each cluster 
  K_DL_cluster[[c]] = K_DL
  K_DL_periodic_cluster[[c]] = K_DL
}
## [1] 0.3299808 0.3265991 0.3434201
## [1] 0.3299765 0.3262256 0.3437979
## [1] 0.3332712 0.3269659 0.3397629
## [1] 0.3318032 0.3263415 0.3418554
## [1] 0.3317627 0.3274553 0.3407819
## [1] 0.3309092 0.3280791 0.3410116
## [1] 0.3316097 0.3263056 0.3420847

Combining DL kernels for each cluster together:

K_DL = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)

K_DL_periodic_weights = rep(1,num_clus)

for(i in 1:num_clus){
  K_DL = K_DL + ((1/num_clus)*K_DL_cluster[[i]])
  K_DL_periodic = K_DL_periodic + ((1/num_clus)*K_DL_periodic_cluster[[i]])
  
  K_DL_periodic_weights[i] = norm(K_DL_periodic_cluster[[i]],type = "F")
}

K_DL_periodic_weights = K_DL_periodic_weights / sum(K_DL_periodic_weights)
print(K_DL_periodic_weights)
## [1] 0.1430462 0.1430127 0.1431990 0.1431142 0.1426814 0.1425824 0.1423643
#Heatmap of resulting K 
# corrplot(K_DL, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
#   title = "DL (3,6,12) Covariance Structure")
# corrplot(K_DL_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
#   title = "Periodic DL (3,6,12) Covariance Structure")

matrix_heatmap(K_DL,title = "DL (3,6,12) Covariance Structure")

matrix_heatmap(K_DL_periodic,title = "Periodic DL (3,6,12) Covariance Structure")

Calculating interaction structure

K_Interaction_cluster = list()
K_Interaction_periodic_cluster = list()

for (c in 1:num_clus){
  
  #Grab interaction pair data for cluster c
  cluster_data = decomposed_cluster_data[[c]]
  W2_clus = cluster_data$W2
  
  K_interaction_list = list()
  K_interaction_periodic_list = list()
  
  column_names = colnames(W2_clus)
  time_span = nrow(W2_clus)
  
  #Create sequence of indices corresponding to comparisons for real time and one lag interaction effects
  lag0_idx = seq(2,3601,by=61)
  lag1_idx = seq(1,3600,by=61)
  
  #Calculate a kernel for each interaction pair 
  for (a in 1:length(column_names)){
    interaction =  W2_clus[,a]
    
    #First calculate these two interaction kernels separately 
    K_int0 = matrix(nrow = 60,ncol = 60)
    K_int1 = matrix(nrow = 60,ncol = 60)
    
    K_int0_periodic = matrix(nrow = 60,ncol = 60)
    K_int1_periodic = matrix(nrow = 60,ncol = 60)
    
    rho_int = 1
    sigma2_int = 1
    
    for (i in 1:60){
      for (j in 1:60){
        
        #RBF kernels
        K_int0[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                            / (2*rho_int)) * sigma2_int
  
        K_int1[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                            / (2*rho_int)) * sigma2_int
        
        #Locally periodic kernels 
        K_int0_periodic[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                         / (2*rho_int)) * exp(- (2*sin((abs(interaction[lag0_idx[i]] - interaction[lag0_idx[j]]))*pi/12)^2)
                         / (rho_int)) * sigma2_int
            
        K_int1_periodic[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                             / (2*rho_int)) * exp(- (2*sin((abs(interaction[lag1_idx[i]] - interaction[lag1_idx[j]]))*pi/12)^2)
                             / (rho_int)) * sigma2_int
      }
    }
    
    #Combine real time and one lag interaction kernels together
    K_interaction = 0.5*K_int0 + 0.5*K_int1
    K_interaction_list[[a]] = K_interaction
    
    K_interaction_periodic = 0.5*K_int0_periodic + 0.5*K_int1_periodic
    K_interaction_periodic_list[[a]] = K_interaction_periodic
  }
  
  #Combine kernels for each interaction pair together
  K_interaction = matrix(0,nrow=60,ncol=60)
  K_interaction_periodic = matrix(0,nrow=60,ncol=60)
  
  K_interaction_periodic_weights = rep(1,length(K_interaction_periodic_list))
  
  for(i in 1:length(K_interaction_periodic_list)){
    K_interaction = K_interaction + ((1/length(K_interaction_list))*K_interaction_list[[i]])
    
    K_interaction_periodic = K_interaction_periodic + ((1/length(K_interaction_periodic_list))*K_interaction_periodic_list[[i]])
    
    K_interaction_periodic_weights[i] = norm(K_interaction_periodic_list[[i]],type = "F")
  }
  
  K_interaction_periodic_weights = K_interaction_periodic_weights / sum(K_interaction_periodic_weights)
  print(K_interaction_periodic_weights)
  
  # corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  # title = "Interaction Covariance Structure")
  # corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  # title = "Periodic Interaction Covariance Structure")
  
  matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
  matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")

  
  #Store final interaction kernel (for all pairs) for each cluster 
  K_Interaction_cluster[[c]] = K_interaction
  K_Interaction_periodic_cluster[[c]] = K_interaction_periodic
}
##  [1] 0.01799398 0.01709268 0.01706268 0.01772355 0.01782421 0.01716357
##  [7] 0.01645371 0.01787718 0.01840045 0.01814090 0.01858716 0.01873067
## [13] 0.01855974 0.01795085 0.01713023 0.01852411 0.01792645 0.01821370
## [19] 0.01863021 0.01806931 0.01739713 0.01679001 0.01802977 0.01771089
## [25] 0.01784844 0.01823677 0.01752284 0.01708067 0.01751396 0.01872658
## [31] 0.01839206 0.01796957 0.01865719 0.01782524 0.01768153 0.01756624
## [37] 0.01881072 0.01830463 0.01822472 0.01844018 0.01808511 0.01767478
## [43] 0.01715139 0.01860286 0.01811352 0.01780259 0.01846529 0.01824598
## [49] 0.01734650 0.01610391 0.01809783 0.01723231 0.01735257 0.01775926
## [55] 0.01784222 0.01733938
##  [1] 0.01803884 0.01713361 0.01713705 0.01777179 0.01784107 0.01705886
##  [7] 0.01651129 0.01792731 0.01845711 0.01817844 0.01860422 0.01876322
## [13] 0.01854934 0.01799890 0.01723745 0.01857292 0.01798694 0.01826528
## [19] 0.01866658 0.01795032 0.01749327 0.01683844 0.01804676 0.01771537
## [25] 0.01786577 0.01818352 0.01735629 0.01708117 0.01753902 0.01875002
## [31] 0.01841447 0.01802720 0.01866245 0.01779154 0.01768402 0.01758671
## [37] 0.01884653 0.01831461 0.01819993 0.01846422 0.01800512 0.01768366
## [43] 0.01697388 0.01860367 0.01792730 0.01766821 0.01843175 0.01817202
## [49] 0.01717160 0.01615916 0.01813972 0.01726979 0.01736962 0.01780464
## [55] 0.01784823 0.01725975
##  [1] 0.01797992 0.01699144 0.01697681 0.01777718 0.01784029 0.01701896
##  [7] 0.01637073 0.01785271 0.01844724 0.01818964 0.01863244 0.01878839
## [13] 0.01858867 0.01795503 0.01716656 0.01855519 0.01795217 0.01825505
## [19] 0.01869061 0.01806611 0.01747365 0.01672483 0.01804956 0.01770563
## [25] 0.01792620 0.01829659 0.01756626 0.01704488 0.01747957 0.01876871
## [31] 0.01841102 0.01805965 0.01870195 0.01779189 0.01767880 0.01757292
## [37] 0.01886938 0.01834226 0.01827614 0.01847973 0.01812173 0.01772425
## [43] 0.01687882 0.01864848 0.01813857 0.01779204 0.01842348 0.01829411
## [49] 0.01720398 0.01599703 0.01810676 0.01721032 0.01731120 0.01778245
## [55] 0.01788553 0.01716650
##  [1] 0.01798815 0.01707250 0.01694199 0.01782006 0.01785618 0.01704239
##  [7] 0.01638232 0.01785643 0.01847610 0.01814356 0.01865467 0.01878815
## [13] 0.01861039 0.01794113 0.01707292 0.01860913 0.01798194 0.01830786
## [19] 0.01867758 0.01808722 0.01738208 0.01662694 0.01801765 0.01768447
## [25] 0.01789614 0.01822566 0.01744549 0.01696939 0.01749136 0.01878553
## [31] 0.01847234 0.01805161 0.01874162 0.01786479 0.01769166 0.01758662
## [37] 0.01886923 0.01835239 0.01820703 0.01853056 0.01816756 0.01775636
## [43] 0.01685123 0.01865585 0.01816681 0.01771010 0.01849356 0.01829750
## [49] 0.01716661 0.01599038 0.01810632 0.01725543 0.01723000 0.01782367
## [55] 0.01789775 0.01722766
##  [1] 0.01796224 0.01701220 0.01692662 0.01778606 0.01785974 0.01725829
##  [7] 0.01634749 0.01782948 0.01844827 0.01812726 0.01863357 0.01876688
## [13] 0.01856995 0.01790556 0.01702322 0.01857154 0.01794633 0.01828354
## [19] 0.01866044 0.01823927 0.01734590 0.01661930 0.01800915 0.01769305
## [25] 0.01787700 0.01826233 0.01752535 0.01697313 0.01745899 0.01876345
## [31] 0.01843720 0.01802069 0.01872466 0.01786430 0.01765832 0.01759966
## [37] 0.01884744 0.01833805 0.01824918 0.01850735 0.01824258 0.01776540
## [43] 0.01691454 0.01860446 0.01822114 0.01775709 0.01848905 0.01827943
## [49] 0.01720040 0.01597595 0.01806987 0.01720394 0.01723912 0.01779865
## [55] 0.01789749 0.01740841
##  [1] 0.01801276 0.01703143 0.01694759 0.01778825 0.01788158 0.01721644
##  [7] 0.01642633 0.01788767 0.01844540 0.01811813 0.01863162 0.01880566
## [13] 0.01860215 0.01797847 0.01715008 0.01856778 0.01795322 0.01827947
## [19] 0.01870554 0.01812662 0.01744781 0.01661799 0.01798394 0.01763119
## [25] 0.01778816 0.01815186 0.01738221 0.01692601 0.01747119 0.01877287
## [31] 0.01841553 0.01797934 0.01874718 0.01787089 0.01766895 0.01762439
## [37] 0.01888591 0.01836894 0.01820732 0.01854172 0.01825139 0.01778447
## [43] 0.01684561 0.01862202 0.01808052 0.01764450 0.01849106 0.01829118
## [49] 0.01715437 0.01607265 0.01813384 0.01722352 0.01722589 0.01781774
## [55] 0.01792503 0.01739660
##  [1] 0.01793093 0.01703255 0.01699054 0.01774645 0.01778528 0.01741396
##  [7] 0.01643922 0.01777190 0.01836883 0.01810608 0.01856923 0.01867281
## [13] 0.01857209 0.01786711 0.01691149 0.01852418 0.01780964 0.01816871
## [19] 0.01847682 0.01826081 0.01726716 0.01667661 0.01797483 0.01756242
## [25] 0.01790155 0.01825023 0.01781933 0.01708938 0.01739940 0.01869059
## [31] 0.01833717 0.01800166 0.01861137 0.01802058 0.01764887 0.01751726
## [37] 0.01875468 0.01815959 0.01814963 0.01840496 0.01829976 0.01772545
## [43] 0.01721479 0.01859167 0.01820298 0.01795298 0.01856058 0.01832354
## [49] 0.01750714 0.01607387 0.01805412 0.01725451 0.01733893 0.01776503
## [55] 0.01784731 0.01763145

Combining interaction kernels from each cluster together:

K_interaction = matrix(0,nrow=60,ncol=60)
K_interaction_periodic = matrix(0,nrow=60,ncol=60)

K_interaction_periodic_weights = rep(1,num_clus)

for(i in 1:num_clus){
  K_interaction = K_interaction + ((1/length(K_Interaction_cluster))*K_Interaction_cluster[[i]])
  
  K_interaction_periodic = K_interaction_periodic + ((1/length(K_Interaction_periodic_cluster))*K_Interaction_periodic_cluster[[i]])
  
  K_interaction_periodic_weights[i] = norm(K_Interaction_periodic_cluster[[i]],type = "F")
}

K_interaction_periodic_weights = K_interaction_periodic_weights / sum(K_interaction_periodic_weights)
print(K_interaction_periodic_weights)
## [1] 0.1431007 0.1426961 0.1425407 0.1426358 0.1428563 0.1425086 0.1436618
# corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Interaction Covariance Structure")
# corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Periodic Interaction Covariance Structure")

matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")

matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")

Kernel target alignment calculation between 3 variations of each kernel

F_norm = function(kernel){
  
  total = sum(as.numeric(abs(kernel)^2))
  Fnorm = sqrt(total)
  return(Fnorm)
}

KTA_norm = function(kernel1,kernel2){
  
  centered_k1 = t(diag(nrow(kernel1)) - (1/nrow(kernel1) * t(diag(nrow(kernel1))) %*% diag(nrow(kernel1)))) %*%
    kernel1 %*% diag(nrow(kernel1)) - (1/nrow(kernel1) * t(diag(nrow(kernel1))) %*% diag(nrow(kernel1)))
  
  centered_k2 = t(diag(nrow(kernel2)) - (1/nrow(kernel2) * t(diag(nrow(kernel2))) %*% diag(nrow(kernel2)))) %*%
    kernel2 %*% diag(nrow(kernel2)) - (1/nrow(kernel2) * t(diag(nrow(kernel2))) %*% diag(nrow(kernel2)))
  
  term1 = centered_k1 / (F_norm(centered_k1))
  term2 = centered_k2 / (F_norm(centered_k2))
  term3 = term1 - term2
  term4 = F_norm(term3)
  
  measure = 1 - 0.5*(term4^2)
  return(measure)
}

KTA_rownames = c("AR Invariant-AR RBF", "AR Invariant-AR LP",
                 "AR RBF-AR LP ", "DL Invariant-DL RBF",
                 "DL Invariant-DL LP", "DL RBF-DL LP",
                 "Interaction RBF-Interaction LP")

KTA_table = matrix(c(KTA_norm(K_AR_invariant,K_AR),
                       KTA_norm(K_AR_invariant,K_AR_periodic),
                       KTA_norm(K_AR,K_AR_periodic),
                       KTA_norm(K_DL_invariant,K_DL),
                       KTA_norm(K_DL_invariant,K_DL_periodic),
                       KTA_norm(K_DL,K_DL_periodic),
                       KTA_norm(K_interaction,K_interaction_periodic)),
                   nrow=7)

KTA_table = data.frame(KTA_table)
rownames(KTA_table) = KTA_rownames
colnames(KTA_table) = "Centered KTA via inner product"

KTA_table
##                                Centered KTA via inner product
## AR Invariant-AR RBF                                 0.5993978
## AR Invariant-AR LP                                  0.6105226
## AR RBF-AR LP                                        0.9997569
## DL Invariant-DL RBF                                 0.7920310
## DL Invariant-DL LP                                  0.7920310
## DL RBF-DL LP                                        1.0000000
## Interaction RBF-Interaction LP                      0.9999298

Cleaning and aggregating CalViDa mortality data

Imputing “< 11” values in data with EM algorithm

In the mortality dataset obtained from Cal-ViDa, all of the cells with small values i.e., less than 10 but not equal to 0, were censored. So in order to avoid using a truncated Poisson distribution, we decided to impute these censored values with an EM algorithm which is described below:

For each county and month, we assume that the number of mortality follows a Poisson distribution with rate \(\lambda_i = exp(X_i \beta)\). We will estimate this rate using a Poisson regression model.

Let us split the whole dataset \(D\) into two sets: \(W\) which denotes the set of observations that are censored and \(D \backslash W\) denotes the set of observations that are not censored.

For observed counts, of which there are \(n_{obs}\) of, we know \(P(Y_i = y_i) = \frac{\lambda_i^{y_i}e^{-\lambda_i}}{y_i!}\) for \(i \in D \backslash W\) and for censored counts, of which there are \(n_{cens}\) of, we do not observe \(Z_i\), but we know which observations are censored i.e., the indices which are censored, and that \(Z_i \in \{ 1,...,10 \}\). So we can say that \(P(Z_i \in \{ 1,...,10 \}) = \sum_{j=1}^{10} \frac{\lambda_i^{j}e^{-\lambda_i}}{j!}\) for \(i \in W\).

Thus, the marginal likelihood of our observed mortality data can be written as:

\(L(\beta ; Y_1,...,Y_{n_{obs}},Z_{W_1},...,Z_{W_{n_{cens}}}) = \Pi_{i \in D \backslash W}^{n_{obs}} \frac{\lambda_i^{y_i}e^{-\lambda_i}}{y_i!} * \Pi_{i \in W}^{n_{cens}} \frac{ \frac{\lambda_i^{Z_i} e^{-\lambda_i}}{Z_i!}}{\sum_{j=2}^{10} \frac{\lambda_i^{j}e^{-\lambda_i}}{j!}} * \mathbf{I} \{ Z_i \in \{ 1,...,10 \} \} = \Pi_{i \in D \backslash W}^{n_{obs}} \frac{exp(X_i \beta)_i^{y_i}e^{-exp(X_i \beta)}}{y_i!} * \Pi_{i \in W}^{n_{cens}} \frac{\frac{exp(X_i \beta)^{Z_i} e^{-exp(X_i \beta)}}{Z_i!}}{\sum_{j=2}^{10} \frac{exp(X_i \beta)^{j}e^{-exp(X_i \beta)}}{j!}} * \mathbf{I} \{ Z_i \in \{ 1,...,10 \} \}\)

OR IS IT \(L(\beta ; Y_1,...,Y_{n_{obs}},Z_{W_1},...,Z_{W_{n_{cens}}}) = \Pi_{i \in D \backslash W}^{n_{obs}} \frac{\lambda_i^{y_i}e^{-\lambda_i}}{y_i!} * \Pi_{i \in W}^{n_{cens}} \sum_{j=1}^{10} (\frac{\lambda_i^{j}e^{-\lambda_i}}{j!}) = \Pi_{i \in D \backslash W}^{n_{obs}} \frac{(X_i \beta)_i^{y_i}e^{-X_i \beta}}{y_i!} * \Pi_{i \in W}^{n_{cens}} \sum_{j=1}^{10} (\frac{(X_i \beta)^{j}e^{-X_i \beta}}{j!})\)

Note that we include the denominator in the truncated Poisson term in order to normalize the density, notice that this does not depend on \(Z_i\).

Given the nature of this complicated marginal likelihood, we will use the EM algorithm to get maximum likelihood estimates (MLEs) of the unknown values \(\beta\).

The full likelihood of \(\overrightarrow Y,\overrightarrow Z\) is given by:

\(L(\beta;\overrightarrow Y,\overrightarrow Z) = \Pi_{i \in D \backslash W}^{n_{obs}} \frac{exp(X_i \beta)_i^{y_i}e^{-exp(X_i \beta)}}{y_i!} * \Pi_{i \in W}^{n_{cens}} \frac{\frac{exp(X_i \beta)^{Z_i} e^{-exp(X_i \beta)}}{Z_i!}}{\sum_{j=2}^{10} \frac{exp(X_i \beta)^{j}e^{-exp(X_i \beta)}}{j!}} * \mathbf{I} \{ Z_i \in \{ 1,...,10 \} \}\) where \(P(\overrightarrow W | \overrightarrow Z) = 1\)

OR IS IT \(L(\beta ; \overrightarrow Y,\overrightarrow Z) = \Pi_{i \in D \backslash W}^{n_{obs}} \frac{(X_i \beta)_i^{y_i}e^{-X_i \beta}}{y_i!} * \Pi_{i \in W}^{n_{cens}} [ \;\sum_{j=1}^{10} (\frac{(X_i \beta)^{j}e^{-X_i \beta}}{j!}) \; ] * I \{ Z_i \in \{ 1,...,10 \} \}\)

The corresponding log likelihood is given by:

\(log \; L(\beta; \overrightarrow Y,\overrightarrow Z) = \sum_{i \in D \backslash W}^{n_{obs}} [Y_i X_i \beta - exp(X_i \beta) - log(Y_i!)] + \sum_{i \in W}^{n_{cens}} [Z_i X_i \beta - exp(X_i \beta) - log(Z_i!)] - log(\sum_{j=2}^{10} \frac{(X_i \beta)^{j}e^{-exp(X_i \beta)}}{j!})\)

OR IS IT \(log \; L(\beta; \overrightarrow Y,\overrightarrow Z) \propto \sum_{i \in D \backslash W}^{n_{obs}} [Y_i X_i \beta - exp(X_i \beta) - log(Y_i!)] + \sum_{i \in W}^{n_{cens}} [Z_i X_i \beta - exp(X_i \beta) - log(Z_i!)]\)

We can exclude the denominator of the second term because it is a constant wrt \(Z\)

In other words, \((Z_i | ,\beta^{(t)}) \sim TruncPois(exp(X_i \beta))\) for \(i \in W\)

E-step: Finding the expected value of the log likelihood function of \(\beta\)

Let \(Q(\beta | \beta^{(t)}) := E_{Z \sim (. | \overrightarrow Y,\beta^{(t)})} [log \; p(\overrightarrow Y,\overrightarrow Z | \beta))]\)

i.e. the expected value of the log likelihood function of \(\beta\), with respect to the current conditional distribution of \(Z\) \(p(Z | \overrightarrow Y,\beta^{(t)})\)

So in our case,

\(Q(\beta | \beta^{(t)}) = \sum_{i \in D \backslash W}^{n_{obs}} [Y_i X_i \beta - exp(X_i \beta) - log(Y_i!)] + \sum_{i \in W}^{n_{cens}} [E[Z_i | \overrightarrow W_i,\beta^{(t)})] \; X_i \beta - exp(X_i \beta) - E[log(Z_i!) | \overrightarrow W_i,\beta^{(t)}]]\)

M-step: Finding the parameters that maximize \(Q(\beta | \beta^{(t)})\)

\(\beta^{(t+1)} = arg \; max_{\beta} Q(\beta | \beta^{(t)})\)

So in our case, we need to maximize \(Q(\beta | \beta^{(t)})\).

\(\beta^{(t+1)} = arg \; max_{\beta} \sum_{i \in D \backslash W}^{n_{obs}} [Y_i X_i \beta - exp(X_i \beta) - log(Y_i!)] + \sum_{i \in W}^{n_{cens}} [E[Z_i | \overrightarrow W_i,\beta^{(t)})] \; X_i \beta - exp(X_i \beta) - E[log(Z_i! ) | \overrightarrow W_i,\beta^{(t)}]]\)

This is equivalent to fitting a Poisson regression in which the outcomes are \(Y_i\) for \(i \leq n_{obs}\) and censored values \(Z_i\) by \(E[Z_i | \overrightarrow W,\beta^{(t)})]\) for \(i \in W\).

OR alternatively, we can take the first and second derivative and apply a Newton-Raphson procedure if we want to solve for \(\beta\) numerically,

We coded this algorithm as follows:

First, we needed to get an initial estimate of our \(\beta\) coefficients in our Poisson regression model. We included age group, county of death, cause of death (either influenza+pneumonia OR chronic lower respiratory disease), and month of death as covariates.

To do this, we needed to perform an initial imputation to get a complete dataset to fit a model on. We decided to do this by making a crude estimation of the rate per 100,000 people \(\lambda_i\). To do this, we calculated a population weighted mean of the number of deaths across all ages and months. However, we only had observed populations at the county level, not for each specific age group included in the mortality data. So using census data which told us the approximate populations for specific age groups (for all of California), we were able to calculate approximate population sizes for each of the age categories included in the mortality data. See death_byage2 for reference. Then using the ratio of a given county’s population relative to the entire population of California, we were able to calculate approximations for the population size of each age group for each county in our mortality dataset. These served as the weights for our population weighted average of the rate of respiratory deaths in California.

population_age = read_xlsx("Population Categories.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
#head(population_age)

population_age = population_age[-(1:5),2]
population_age$...2 = as.numeric(population_age$...2)

#split under 5 category into < 1 and 1-4 years old 
less1 = floor(population_age$...2[1]*0.2)
onefour = floor(population_age$...2[1]*0.8)
death_byage = population_age$...2[-1]
death_byage = c(less1,onefour,death_byage)
death_byage = death_byage[1:19]

death_byage2 = death_byage[1:2]
idx = seq(from = 3, to = 17, by = 2)
for (i in idx){
  death_byage2 = c(death_byage2,(death_byage[i]+death_byage[i+1]))
}
death_byage2 = c(death_byage2,death_byage[19])


age_groups = unique(mortality$Age)
death_byage2 = data.frame(cbind(age_groups,death_byage2))
colnames(death_byage2) = c("Age_Group","Population_by_Age")
death_byage2$Population_by_Age = as.numeric(death_byage2$Population_by_Age)
#head(death_byage2)

#2010-2019 population data for CA 
USpops = read.csv("CA_census_pops1019.csv")
CApops = USpops %>% filter(STNAME == "California") %>% select(CTYNAME,POPESTIMATE2019)
counties = countycodes$value_represented #from EPA data file

weights = CApops[(2:59),2]
weights = weights/CApops[1,2]

groups = unique(mortality$Age)
step1 = 1
step2 = 1

for (i in counties){
  for (j in groups){
    idx = which(mortality$Age == j & mortality$County_of_Death == i)
    mortality$Population[idx] = ceiling(death_byage2$Population[step1]*weights[step2])
    step1 = step1+1
  }  
  step1 = 1
  step2 = step2+1
}

mortality$logpop = log(mortality$Population)

censored_idx = which(mortality$Total_Deaths == "<11")
censorTF = mortality$Total_Deaths == "<11"
mortality = cbind(mortality,censorTF)
#head(mortality)

GETTING INITIAL GUESS FOR LAMBDA: AVG DEATHS (PER 100K PEOPLE) PER MONTH FOR ONE COUNTY

uncensored_mortality = mortality %>% filter(censorTF == FALSE) %>% select(Total_Deaths,Population)
uncensored_mortality$Total_Deaths = as.numeric(uncensored_mortality$Total_Deaths)

theta = mean(uncensored_mortality$Total_Deaths*100000/uncensored_mortality$Population)

By using all the data, I obtained a crude initial guess for \(\lambda\) of about 1.08. Using this initial estimate \(\lambda\), we calculated the expected value for each \(Z_i\) to get an initial imputed dataset. This dataset will be used to estimate a Poisson regression model which will give us our initial value for our actual parameters of interest \(\beta\).

FUNCTION FOR IMPUTING CENSORED VALUE BASED ON EXPECTATION GIVEN LAMBDA

impute_small_values = function(lambda){
  x = 1:10
  p = dpois(x,lambda)
  
  value = sum(x*p)/sum(p)
  return(value)
}

INITIAL IMPUTATION:

mortality2 = mortality
mortality2$Total_Deaths[censored_idx] = 0.01
mortality2$Total_Deaths = as.numeric(mortality2$Total_Deaths)

for (i in censored_idx){
  lambda = theta*mortality2$Population[i] / 100000
  deaths = impute_small_values(lambda)
  
  mortality2$Total_Deaths[i] = floor(deaths)
}

INITIAL REGRESSION MODELS:

mortality2$Age = factor(mortality2$Age)
mortality2$Cause_of_Death = factor(mortality2$Cause_of_Death)
mortality2$Month = factor(mortality2$Month)

pois_reg = glm(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), family = "poisson", data = mortality2)
# ZIP_reg = zeroinfl(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop) | 1, data = mortality2, dist = "poisson", link = "logit")

vec0 = coef(pois_reg)
# vec0 = coef(ZIP_reg)

summary(pois_reg)
## 
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), 
##     family = "poisson", data = mortality2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4252  -0.5207  -0.2124  -0.0650  14.5038  
## 
## Coefficients:
##                                         Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           -13.744433   0.087789 -156.562  < 2e-16
## Age15 - 24 years                        0.568750   0.095519    5.954 2.61e-09
## Age25 - 34 years                        1.081482   0.091812   11.779  < 2e-16
## Age35 - 44 years                        1.667882   0.090131   18.505  < 2e-16
## Age45 - 54 years                        2.381811   0.088827   26.814  < 2e-16
## Age5 - 14 years                         0.236541   0.099117    2.386    0.017
## Age55 - 64 years                        3.136398   0.088079   35.609  < 2e-16
## Age65 - 74 years                        4.183664   0.087714   47.697  < 2e-16
## Age75 - 84 years                        5.352522   0.087582   61.115  < 2e-16
## Age85 years and over                    6.499169   0.087521   74.258  < 2e-16
## AgeLess than 1 year                     0.965400   0.138658    6.962 3.34e-12
## Cause_of_DeathInfluenza and pneumonia  -0.751349   0.007035 -106.798  < 2e-16
## Month2                                 -0.253385   0.013643  -18.573  < 2e-16
## Month3                                 -0.251082   0.013634  -18.416  < 2e-16
## Month4                                 -0.450337   0.014455  -31.155  < 2e-16
## Month5                                 -0.522552   0.014781  -35.352  < 2e-16
## Month6                                 -0.646664   0.015382  -42.040  < 2e-16
## Month7                                 -0.661526   0.015458  -42.796  < 2e-16
## Month8                                 -0.709630   0.015707  -45.180  < 2e-16
## Month9                                 -0.756347   0.015957  -47.400  < 2e-16
## Month10                                -0.651645   0.015407  -42.294  < 2e-16
## Month11                                -0.617588   0.015237  -40.532  < 2e-16
## Month12                                -0.357779   0.014059  -25.449  < 2e-16
##                                          
## (Intercept)                           ***
## Age15 - 24 years                      ***
## Age25 - 34 years                      ***
## Age35 - 44 years                      ***
## Age45 - 54 years                      ***
## Age5 - 14 years                       *  
## Age55 - 64 years                      ***
## Age65 - 74 years                      ***
## Age75 - 84 years                      ***
## Age85 years and over                  ***
## AgeLess than 1 year                   ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2                                ***
## Month3                                ***
## Month4                                ***
## Month5                                ***
## Month6                                ***
## Month7                                ***
## Month8                                ***
## Month9                                ***
## Month10                               ***
## Month11                               ***
## Month12                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 382636  on 91871  degrees of freedom
## Residual deviance:  74194  on 91849  degrees of freedom
## AIC: 126734
## 
## Number of Fisher Scoring iterations: 7
# summary(ZIP_reg)

Now that we have initialized our parameters, \(\beta^{(0)}\), we can proceed with the EM algorithm until our parameters (the coefficients of our regression model), converge.

The main steps implemented in the chunk below are:

  1. Given a newly fitted Poisson regression model with parameter values \(\beta^{(t)}\), take its fitted values for the \(\lambda\)’s corresponding to observations that were censored in the original mortality dataset

  2. Use those fitted \(\lambda\)’s, calculate the expected value of our unknown values \(Z\)

  3. Once all \(Z_i\)’s are imputed, we can use the now complete dataset to estimate the Poisson regression model again, which will produce the maximum likelihood estimate of our parameters \(\beta\), these are our new values \(\beta^{(t+1)}\).

  4. Compare the difference between our new \(\beta\) coefficient estimates with those from the previous iteration and either perform another iteration or stop the algorithm if the maximum difference between coefficients from different iterations is less than 0.01.

Note: I experimented with a ZIP regression model as well but the log likelihood values at each iteration were generally higher for the Poisson regression model

mortality3 = mortality2
model = pois_reg

model_diff = 100
iter = 1
vec0 = coef(model)

while((model_diff > 0.01) & (iter < 10)){
  
  #impute data (should be between 1-10)
  fvs = fitted.values(model)
  
  for (i in censored_idx){
    deaths = impute_small_values(fvs[i])
    mortality3$Total_Deaths[i] = floor(deaths)
  }
  
  #fit model on "new" data
  model = glm(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), family = "poisson", data = mortality3)
  vec1 = coef(model)
  
  model_diff = max(abs(vec1 - vec0))
  iter = iter+1
  vec0 = vec1
}

final_pois_reg = model
summary(final_pois_reg)
## 
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), 
##     family = "poisson", data = mortality3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9268  -0.3514  -0.1387  -0.0449  12.3014  
## 
## Coefficients:
##                                         Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           -14.351349   0.116534 -123.152  < 2e-16
## Age15 - 24 years                       -0.054190   0.135792   -0.399 0.689845
## Age25 - 34 years                        0.430115   0.127371    3.377 0.000733
## Age35 - 44 years                        1.210281   0.122028    9.918  < 2e-16
## Age45 - 54 years                        2.345599   0.118274   19.832  < 2e-16
## Age5 - 14 years                        -0.455007   0.145826   -3.120 0.001807
## Age55 - 64 years                        3.744079   0.116771   32.063  < 2e-16
## Age65 - 74 years                        5.001304   0.116457   42.946  < 2e-16
## Age75 - 84 years                        6.168765   0.116379   53.006  < 2e-16
## Age85 years and over                    7.285988   0.116346   62.623  < 2e-16
## AgeLess than 1 year                     1.536529   0.158560    9.691  < 2e-16
## Cause_of_DeathInfluenza and pneumonia  -0.719013   0.006477 -111.013  < 2e-16
## Month2                                 -0.245579   0.012764  -19.240  < 2e-16
## Month3                                 -0.234760   0.012726  -18.448  < 2e-16
## Month4                                 -0.416647   0.013416  -31.057  < 2e-16
## Month5                                 -0.488946   0.013715  -35.649  < 2e-16
## Month6                                 -0.603208   0.014221  -42.418  < 2e-16
## Month7                                 -0.631446   0.014352  -43.998  < 2e-16
## Month8                                 -0.672621   0.014547  -46.237  < 2e-16
## Month9                                 -0.708569   0.014723  -48.128  < 2e-16
## Month10                                -0.618620   0.014292  -43.285  < 2e-16
## Month11                                -0.580590   0.014118  -41.125  < 2e-16
## Month12                                -0.337932   0.013106  -25.784  < 2e-16
##                                          
## (Intercept)                           ***
## Age15 - 24 years                         
## Age25 - 34 years                      ***
## Age35 - 44 years                      ***
## Age45 - 54 years                      ***
## Age5 - 14 years                       ** 
## Age55 - 64 years                      ***
## Age65 - 74 years                      ***
## Age75 - 84 years                      ***
## Age85 years and over                  ***
## AgeLess than 1 year                   ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2                                ***
## Month3                                ***
## Month4                                ***
## Month5                                ***
## Month6                                ***
## Month7                                ***
## Month8                                ***
## Month9                                ***
## Month10                               ***
## Month11                               ***
## Month12                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 455249  on 91871  degrees of freedom
## Residual deviance:  38553  on 91849  degrees of freedom
## AIC: 97178
## 
## Number of Fisher Scoring iterations: 8
logLik(final_pois_reg)
## 'log Lik.' -48566.18 (df=23)

To solve for the fixed point solutions for \(\beta\) numerically, we would first take the derivative of \(Q()\) wrt each \(\beta_j\) which gives us

\(\frac{dQ}{d\beta_j} = \sum_{i \in D \backslash W}^{n_{obs}}[y_i - exp(\alpha_i + x_i \beta)]x_{ij} + \sum_{i \in W}^N [\tilde y_i - exp(\alpha_i + x_i \beta)]x_{ij}\) where \(\alpha_i\) represents the offset associated with each observation \(i\) and \(\tilde y_i\) represents the imputed values corresponding to censored observation \(Z_i\)

Rewriting this in vector form (since we have 3 covariates, which are all categorical) we obtain the following gradient:

\(f'(\beta) = \frac{dQ}{d \overrightarrow \beta} = X^T [\overrightarrow y - exp(\overrightarrow \alpha + X \beta)]\) where first term is a \(p\times n\) matrix and second term is a \(n\times 1\) vector

Then, we take another derivative to get the Hessian:

\(f''_{jj'}(\beta) = -\sum_{i=1}^N exp(\alpha_i + x_i \beta) X_{ij} X_{ij'} = -X^T diag(exp(\overrightarrow \alpha + X \beta)) X\)

Finally, we solve for the next value for our \(\beta\)s using these two values with the following equation:

\(\beta^{(b)} = \beta^{(b-1)} - [(f''(\beta^{(b-1)}))^{-1} f'(\beta^{(b-1)})]\)

Now we need to code up a Newton Raphson function and initialize it:

Y = mortality2$Total_Deaths

#X is design matrix w col of 1s then each level of each categorical predictors except their baselines 
intercept = rep(1,length(Y))

#Age categories 
Age1524 = as.numeric(mortality2$Age == "15 - 24 years")
Age2534 = as.numeric(mortality2$Age == "25 - 34 years")
Age3544 = as.numeric(mortality2$Age == "35 - 44 years")
Age4554 = as.numeric(mortality2$Age == "45 - 54 years")
Age514 = as.numeric(mortality2$Age == "5 - 14 years")
Age5564 = as.numeric(mortality2$Age == "55 - 64 years")
Age6574 = as.numeric(mortality2$Age == "65 - 74 years")
Age7584 = as.numeric(mortality2$Age == "75 - 84 years")
Age85 = as.numeric(mortality2$Age == "85 years and over")
Age1 = as.numeric(mortality2$Age == "Less than 1 year")

#Cause of death categories 
Cause2 = as.numeric(mortality2$Cause_of_Death == "Influenza and pneumonia")

#Month categories 
Month2 = as.numeric(mortality2$Month == 2)
Month3 = as.numeric(mortality2$Month == 3)
Month4 = as.numeric(mortality2$Month == 4)
Month5 = as.numeric(mortality2$Month == 5)
Month6 = as.numeric(mortality2$Month == 6)
Month7 = as.numeric(mortality2$Month == 7)
Month8 = as.numeric(mortality2$Month == 8)
Month9 = as.numeric(mortality2$Month == 9)
Month10 = as.numeric(mortality2$Month == 10)
Month11 = as.numeric(mortality2$Month == 11)
Month12 = as.numeric(mortality2$Month == 12)

X = cbind(intercept,Age1524,Age2534,Age3544,Age4554,Age514,Age5564,Age6574,Age7584,Age85,Age1,
          Cause2,Month2,Month3,Month4,Month5,Month6,Month7,Month8,Month9,Month10,
          Month11,Month12)
# dim(X)

offset_vec = offset(mortality2$logpop)
offset_vec = matrix(offset_vec,ncol=1)

#Initial guesses for beta
B = coef(pois_reg)
B = matrix(B,ncol=1)

#Define first derivative of Q function
f_gradient = function(Y,X,B){
  value = t(X) %*% (Y - exp(X %*% B + offset_vec))
  return(value)
}

# f_gradient(Y,X,B)

#Define second derivative of Q function
f_hessian = function(Y,X,B){
  middle = as.numeric(exp(X %*% B + offset_vec))
  X2 = X
  
  for (i in 1:length(middle)){
    X2[i,] = X[i,] * middle[i]
  }
  
  value = -t(X) %*% X2
  
  return(value)
}

# dim(f_hessian(Y,X,B))

#Define Newton Raphson function and compute initial beta coefficient estimates 
Newton_Raphson = function(Y,X,x0,tol = 0.001,eps = 0.01,max_iter = 100){
  for (i in 1:max_iter){
    g = f_gradient(Y,X,x0)
    h = f_hessian(Y,X,x0)
    value = abs(det(h))
    
    if (value < eps){
      break
    }
    
    x1 = x0 - (solve(h) %*% g)
    # x1 = x0 - (solve(h) %*% g * (0.01 * 0.999^i)) #gradient descent is too large at each iteration so need to slow it down
    
    if (max(abs(x1-x0)) <= tol){
      return(x1)
    }
    
    x0 = x1
  }
  
  return(x0)
}

#Initial beta coefficient estimates 
new_coefs = Newton_Raphson(Y,X,B)

Similar to above, now that we have initialized our parameters, \(\beta^{(0)}\), we can proceed with the EM algorithm until our parameters (the coefficients of our regression model), converge.

The main steps implemented in the chunk below are:

  1. Given newly estimated parameter values \(\beta^{(t)}\) from the Newton-Raphson procedure above, take its fitted values for the \(\lambda\)’s corresponding to observations that were censored in the original mortality dataset

  2. Use those fitted \(\lambda\)’s, calculate the expected value of our unknown values \(Z\)

  3. Once all \(Z_i\)’s are imputed, we can use the now complete dataset to estimate the beta coefficients with Newton-Raphson again, which will produce the maximum likelihood estimate of our parameters \(\beta\), these are our new values \(\beta^{(t+1)}\).

  4. Compare the difference between our new \(\beta\) coefficient estimates with those from the previous iteration and either perform another iteration or stop the algorithm if the maximum difference between coefficients from different iterations is less than 0.01.

while((model_diff > 0.01) & (iter < 10)){
  
  #impute data (should be between 1-10)
  fvs_NR = exp((X %*% new_coefs) + offset_vec)
  
  for (i in censored_idx){
    deaths = impute_small_values(fvs_NR[i])
    Y[i] = floor(deaths)
  }
  
  new_coefs2 = Newton_Raphson(Y,X,new_coefs)
  
  model_diff = max(abs(new_coefs2 - new_coefs))
  new_coefs = new_coefs2
  iter = iter+1
  vec0 = vec1
}

new_coefs
##                  [,1]
## intercept -13.7444329
## Age1524     0.5687496
## Age2534     1.0814825
## Age3544     1.6678819
## Age4554     2.3818112
## Age514      0.2365414
## Age5564     3.1363978
## Age6574     4.1836642
## Age7584     5.3525223
## Age85       6.4991688
## Age1        0.9653963
## Cause2     -0.7513495
## Month2     -0.2533848
## Month3     -0.2510821
## Month4     -0.4503366
## Month5     -0.5225520
## Month6     -0.6466641
## Month7     -0.6615256
## Month8     -0.7096304
## Month9     -0.7563473
## Month10    -0.6516455
## Month11    -0.6175879
## Month12    -0.3577794
summary(final_pois_reg)
## 
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), 
##     family = "poisson", data = mortality3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9268  -0.3514  -0.1387  -0.0449  12.3014  
## 
## Coefficients:
##                                         Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                           -14.351349   0.116534 -123.152  < 2e-16
## Age15 - 24 years                       -0.054190   0.135792   -0.399 0.689845
## Age25 - 34 years                        0.430115   0.127371    3.377 0.000733
## Age35 - 44 years                        1.210281   0.122028    9.918  < 2e-16
## Age45 - 54 years                        2.345599   0.118274   19.832  < 2e-16
## Age5 - 14 years                        -0.455007   0.145826   -3.120 0.001807
## Age55 - 64 years                        3.744079   0.116771   32.063  < 2e-16
## Age65 - 74 years                        5.001304   0.116457   42.946  < 2e-16
## Age75 - 84 years                        6.168765   0.116379   53.006  < 2e-16
## Age85 years and over                    7.285988   0.116346   62.623  < 2e-16
## AgeLess than 1 year                     1.536529   0.158560    9.691  < 2e-16
## Cause_of_DeathInfluenza and pneumonia  -0.719013   0.006477 -111.013  < 2e-16
## Month2                                 -0.245579   0.012764  -19.240  < 2e-16
## Month3                                 -0.234760   0.012726  -18.448  < 2e-16
## Month4                                 -0.416647   0.013416  -31.057  < 2e-16
## Month5                                 -0.488946   0.013715  -35.649  < 2e-16
## Month6                                 -0.603208   0.014221  -42.418  < 2e-16
## Month7                                 -0.631446   0.014352  -43.998  < 2e-16
## Month8                                 -0.672621   0.014547  -46.237  < 2e-16
## Month9                                 -0.708569   0.014723  -48.128  < 2e-16
## Month10                                -0.618620   0.014292  -43.285  < 2e-16
## Month11                                -0.580590   0.014118  -41.125  < 2e-16
## Month12                                -0.337932   0.013106  -25.784  < 2e-16
##                                          
## (Intercept)                           ***
## Age15 - 24 years                         
## Age25 - 34 years                      ***
## Age35 - 44 years                      ***
## Age45 - 54 years                      ***
## Age5 - 14 years                       ** 
## Age55 - 64 years                      ***
## Age65 - 74 years                      ***
## Age75 - 84 years                      ***
## Age85 years and over                  ***
## AgeLess than 1 year                   ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2                                ***
## Month3                                ***
## Month4                                ***
## Month5                                ***
## Month6                                ***
## Month7                                ***
## Month8                                ***
## Month9                                ***
## Month10                               ***
## Month11                               ***
## Month12                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 455249  on 91871  degrees of freedom
## Residual deviance:  38553  on 91849  degrees of freedom
## AIC: 97178
## 
## Number of Fisher Scoring iterations: 8

NEWTON RAPHSON APPROACH DOES NOT WORK WELL BC AT EACH ITERATION VALUES ARE CHANGING BY TOO MUCH, LEADS TO HESSIAN MATRIX BEING UNINVERTIBLE

Now that we have imputed the censored “< 11” values in the Cal-ViDa dataset, we will now aggregate the data to get total number of respiratory related deaths for each county for every month between 2014-2019, except for Sept-Dec 2014 which are unavailable for some reason.

Empirical 5 number summaries for each variable of interest

print("Summary for respiratory related mortality")
## [1] "Summary for respiratory related mortality"
summary(mortality3$Total_Deaths) #summary of deaths per month, for all counties i.e. all of CA, age groups, and months 2014-2019
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.178   0.000 209.000
print("Summary for EPA data")
## [1] "Summary for EPA data"
for (i in pollutants$parametercodes.code){
  data = final_EPA_data %>% filter(Pollutant == i) 
  print(summary(data$Value)) #summary of values for each pollutant, for all counties i.e. all of CA, all months (2014-2019)
  AQI = data$AQI
}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0120  0.0140  0.0160  0.0184  0.0190  0.0655 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2292  0.3000  0.3458  0.3719  0.4333  0.6500 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2636  0.4864  0.6545  0.6448  0.7773  1.1318 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.402   4.939   7.139   8.351  11.071  21.668 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.008118 0.020000 0.026412 0.025109 0.028948 0.037000 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   13.00   18.00   20.09   25.00   70.50 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.200   6.700   7.800   8.546   9.600  23.350
summary(AQI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   33.00   36.00   38.34   40.50   75.00
print("Summary for SDI data")
## [1] "Summary for SDI data"
summary(soa.data$Score) #summary of SDI score, for all counties i.e. all of CA, all years 2010-2019
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.38   93.45  106.95  112.53  131.69  183.67

Reformatting the data set (mainly aggregating)

First, we wanted to combine the number of deaths from the two different causes into a total number of respiratory related deaths for each age group. Then, we combined the total number of deaths for each age group for a given month and county in a singular total for that month and county.

#Aggregating by  cause of death
data = mortality3

data1 = data %>% filter(Cause_of_Death == "Chronic lower respiratory diseases")
data2 = data %>% filter(Cause_of_Death == "Influenza and pneumonia")

newdeaths = data1$Total_Deaths + data2$Total_Deaths
data1$Total_Deaths = newdeaths

respmortality = data1[,-5]


#Creates total deaths by adding deaths of all age groups together 
agg.respmortality = respmortality[1,]
agg.respmortality$Age = as.character(agg.respmortality$Age)
rows2 = seq(1,nrow(respmortality),11)

for (i in rows2){
  agg.respmortality[i,] = respmortality[i,]
  agg.respmortality$Total_Deaths[i] = sum(respmortality$Total_Deaths[i:(i+10)])
  agg.respmortality$Age[i] = "Everyone"
}

agg.respmortality = na.omit(agg.respmortality)
rownames(agg.respmortality) = NULL
total.respmortality = agg.respmortality[,-c(6:9)]

Reformat dataset into time series format (rows are counties, columns are months)

Total deaths:

months = unique(total.respmortality$Month_of_Death)
years = sort(unique(total.respmortality$Year_of_Death))
counties = unique(total.respmortality$County_of_Death)
x = 0

total.mortality.ts = matrix(1,nrow = 58, ncol = 72)

for (k in counties){
  county.ts = c()
  x = x+1
  
  for (i in years){
    for (j in months){
      deaths = total.respmortality %>% filter(County_of_Death == k) %>%  filter(Year_of_Death == i) %>% filter(Month_of_Death == j) %>% select(Total_Deaths) %>% as.numeric()
      county.ts = c(county.ts,deaths)
    }
  } 
  
  total.mortality.ts[x,] = county.ts
}

#Label time series data 
total.mortality.ts = as.data.frame(total.mortality.ts)

dates = c()
x=1
for (i in years){
  for (j in months){
    dates[x] = sprintf("%1.0f/%1.0f",j,i)
    x = x+1
  }
}

colnames(total.mortality.ts) = dates
ID = c(1:58)
total.mortality.ts = cbind(ID,counties,total.mortality.ts)

total.mortality.ts = left_join(clusterlabels,total.mortality.ts,by = "counties")
head(total.mortality.ts)
##    counties Cluster ID 1/2014 2/2014 3/2014 4/2014 5/2014 6/2014 7/2014 8/2014
## 1   Alameda       4  1     84     61     57     57     55     44     44     44
## 2    Alpine       5  2      0      0      0      0      0      0      0      0
## 3    Amador       5  3      2      3      2      2      2      1      3      1
## 4     Butte       1  4     15     10     11      8      5      6      7      6
## 5 Calaveras       5  5      3      4      2      3      0      0      1      0
## 6    Colusa       6  6      0      1      1      0      1      0      0      0
##   9/2014 10/2014 11/2014 12/2014 1/2015 2/2015 3/2015 4/2015 5/2015 6/2015
## 1     42      51      50      62    101     72     63     61     63     46
## 2      0       0       0       0      0      0      0      0      0      0
## 3      1       3       2       1      3      3      3      5      4      4
## 4      7       5       7       9     13      8     10     19      6      8
## 5      1       2       1       1      2      0      2      2      1      2
## 6      1       0       0       0      1      2      0      1      0      0
##   7/2015 8/2015 9/2015 10/2015 11/2015 12/2015 1/2016 2/2016 3/2016 4/2016
## 1     44     40     36      41      41      64     78     56     62     55
## 2      0      0      0       0       0       0      0      0      0      0
## 3      2      2      2       2       2       4      3      3      2      2
## 4      6     10      3       7       8       6     13      6      8      7
## 5      2      1      2       0       1       4      3      2      4      2
## 6      0      0      1       1       1       2      1      0      1      0
##   5/2016 6/2016 7/2016 8/2016 9/2016 10/2016 11/2016 12/2016 1/2017 2/2017
## 1     39     44     47     43     37      53      48      60    114     66
## 2      0      0      0      0      0       1       0       0      0      0
## 3      0      0      1      1      2       0       2       2      4      2
## 4     18      4      4      6      7       6       8       8     11     11
## 5      3      2      3      2      1       0       0       3      2      1
## 6      0      0      0      0      0       0       0       2      0      1
##   3/2017 4/2017 5/2017 6/2017 7/2017 8/2017 9/2017 10/2017 11/2017 12/2017
## 1     62     59     45     45     48     37     41      40      44      62
## 2      0      0      0      0      0      0      0       0       0       0
## 3      1      3      3      3      4      2      2       1       1       2
## 4     11     10      8     10      6      8      5      10       7       9
## 5      2      4      2      2      1      2      1       0       0       3
## 6      0      0      0      1      0      1      0       0       0       1
##   1/2018 2/2018 3/2018 4/2018 5/2018 6/2018 7/2018 8/2018 9/2018 10/2018
## 1    114     58     62     52     50     46     49     39     39      49
## 2      0      0      0      0      0      0      0      0      0       0
## 3      6      2      2      0      2      3      1      1      2       3
## 4     13     10     10      9      8      7      6      8      7       8
## 5      3      4      3      2      2      2      1      1      5       3
## 6      0      0      1      1      2      0      0      0      1       0
##   11/2018 12/2018 1/2019 2/2019 3/2019 4/2019 5/2019 6/2019 7/2019 8/2019
## 1      41      49     60     61     56     60     38     43     48     45
## 2       1       0      0      0      0      0      0      0      0      0
## 3       3       2      4      2      2      4      0      1      3      0
## 4       8       6     10      8      8      7      7      8      6      6
## 5       2       1      3      3      2      1      1      4      0      2
## 6       0       0      2      0      0      1      1      1      0      1
##   9/2019 10/2019 11/2019 12/2019
## 1     38      44      47      68
## 2      0       0       0       0
## 3      2       1       2       3
## 4      5       5       8       7
## 5      3       1       1       0
## 6      1       0       1       2

Exploring the aggregated data

HOW MANY 0s DOES EACH COUNTY HAVE?

numzeros_total = c()
for (i in 1:58){
  numzeros_total[i] = length(which(total.mortality.ts[i,3:74] == 0))
}

numzeros_total
##  [1]  0 69  6  0 10 42  0  8  0  0 32  0  0 20  0  1  0 31  0  0  0 31  1  0 25
## [26] 62  0  0  1  0  0 25  0  0 13  0  0  0  0  0  0  0  0  0  0 61  2  0  0  0
## [51]  3  2 38  0  3  0  0  0
propzeros_total = numzeros_total/72

length(which(propzeros_total > 0.85))
## [1] 2
countycodes$value_represented[which(propzeros_total > 0.75)]
## [1] "Alpine" "Mono"   "Sierra"
hist(propzeros_total,breaks = 20 ,xlab = "Proportion of months with 0 deaths",main = "Do some counties have more strings of 0s than others?")

One aspect of the data that we wanted to examine before proceeding with our analysis was the frequency in which there were 0 deaths in a given county for a month. This would inform us about whether a standard Poisson model or a zero inflated Poisson model would be more appropriate. What I did above was first calculate the proportion of months (out of 72) that had 0 deaths observed for each county. Then identified which counties had a proportion of 0s greater than 75%, 85%, etc. Then, I made a histogram which shows there are only a couple of counties (which have very small populations) that had a high frequency of 0s. The aggregation performed in previous steps addressed the zero inflation it appears.

AGGREGATING MORTALITY DATA INTO CLUSTERS AS OPPOSED TO EACH COUNTY (ALSO AGGREGATED TO MORTALITY RATE PER CLUSTER)

Again, we want the number of respiratory related deaths at the cluster level, not the county level. So we once again aggregate the observations for each county in a given cluster. First, we simply add all the observations in cluster together to get a total number of respiratory related deaths for the months of 2014-2019 for each cluster. Then, we also calculated a mortality rate (per 100k people) for each cluster. This was done by taking the total number of deaths for a given cluster and dividing it by the total population of that cluster times 100,000 i.e. (deaths\(*\frac{100000}{clusterpop}\)). This second dataset will be used for our Gaussian process regression model which needs to be fit on a continuous response variable.

#County populations by year pulled from SoA data
countypops = CA_data %>% filter(Year > 2013) %>% select(Total_Pop,County,Year) %>% unique()
countypops = cbind(countypops,Cluster = rep(clusterlabels$Cluster,each=6))

cluster_mortality_total = matrix(NA,nrow = 72,ncol = num_clus)

cluster_mortality_rate = matrix(NA,nrow = 72,ncol = num_clus)

for (i in 1:num_clus){
  cluster = total.mortality.ts %>% filter(Cluster == i)
  year = 2014
  
  for(j in 1:72){
    col = cluster[,j+3]
    
    #Sum of deaths across counties in a cluster
    cluster_mortality_total[j,i] = sum(na.omit(col))
    
    #Rate of deaths (per 100,000) across counties in a cluster
    pops = countypops %>% filter(Year == year,Cluster == i) %>% select(Total_Pop)
    cluster.pop = sum(pops)
    cluster_mortality_rate[j,i] = (sum(na.omit(col))/cluster.pop)*100000
    
    if ((j>12) & (j<25)){
      year = 2015
    }
    
    else if ((j>24) & (j<37)){
      year = 2016
    }
    
    else if ((j>36) & (j<49)){
      year = 2017
    }
    
    else if ((j>48) & (j<61)){
      year = 2018
    }
    
    else if ((j>60) & (j<73)){
      year = 2019
    }
    
    else{
      year = 2014
    }
  }
}

#Time series of total deaths for each cluster
colnames(cluster_mortality_total) = c("Cluster 1","Cluster 2","Cluster 3",
                                      "Cluster 4","Cluster 5","Cluster 6",
                                      "Cluster 7")
rownames(cluster_mortality_total) = colnames(total.mortality.ts[4:75])
cluster_mortality_total = data.frame(cluster_mortality_total)

head(cluster_mortality_total)
##        Cluster.1 Cluster.2 Cluster.3 Cluster.4 Cluster.5 Cluster.6 Cluster.7
## 1/2014        80        15        80       601       310        55      1101
## 2/2014        51         6        44       465       250        39       867
## 3/2014        47        15        42       437       234        32       854
## 4/2014        45        12        38       395       203        36       705
## 5/2014        45        17        36       360       195        31       710
## 6/2014        37         9        43       308       166        25       663
#Time series of rate of deaths (per 100,000) for each cluster
colnames(cluster_mortality_rate) = c("Cluster 1","Cluster 2","Cluster 3",
                                      "Cluster 4","Cluster 5","Cluster 6",
                                      "Cluster 7")
rownames(cluster_mortality_rate) = colnames(total.mortality.ts[4:75])
cluster_mortality_rate = data.frame(cluster_mortality_rate)

head(cluster_mortality_rate)
##        Cluster.1 Cluster.2 Cluster.3 Cluster.4 Cluster.5 Cluster.6 Cluster.7
## 1/2014  6.635007  6.013784  6.711308  5.644215  5.944301  5.431745  5.891607
## 2/2014  4.229817  2.405513  3.691219  4.366988  4.793791  3.851601  4.639440
## 3/2014  3.898066  6.013784  3.523437  4.104030  4.486989  3.160288  4.569875
## 4/2014  3.732191  4.811027  3.187871  3.709592  3.892558  3.555324  3.772555
## 5/2014  3.732191  6.815621  3.020089  3.380894  3.739157  3.061529  3.799310
## 6/2014  3.068691  3.608270  3.607328  2.892543  3.183077  2.468975  3.547807
# #Removes Sept-Dec 2014 NAs
# cluster_mortality_total = na.omit(cluster_mortality_total)

MAKE A TIME SERIES FOR EACH CLUSTER:

plot(ts(cluster_mortality_total$Cluster.1),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 1")

plot(ts(cluster_mortality_total$Cluster.2),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 2")

plot(ts(cluster_mortality_total$Cluster.3),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 3")

plot(ts(cluster_mortality_total$Cluster.4),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 4")

plot(ts(cluster_mortality_total$Cluster.5),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 5")

plot(ts(cluster_mortality_total$Cluster.6),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 6")

plot(ts(cluster_mortality_total$Cluster.7),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 7")

Fitting INLA models: spatial GLMM, Besag-York-Mollie (BYM) model, LGCPs

Now that all of the data from the SoA (used for SKATER and HUGE to get graph filter H), EPA (used to get gram matrix K), and Cal-ViDa (response) is downloaded, cleaned, and well formatted, we can now fit our kernel graph regression model as well as a few reference models, which we will compare against each other. For now, I have implemented a training-test data fitting approach to evaluating model performance.

First, I created a test dataset (inla_test_data) which has variables ID (which represents the cluster label), ID2 (which is basically an index label), response (cluster mortality), time (time index label 1-72), and months (month label 1-12). The response column inherently has NA values for time points 9-12 because those were not provided in the original Cal-ViDa dataset, but other than that, all observations are included. In the training dataset, I decided to hold out the last 3 months of the data (70-72 or Oct-Dec 2019) so I replaced those response values with NAs. This is how you get INLA to make predictions/forecasts because it does so based on the posterior predictive distribution.

Next, I created a second training dataset specifically for the GP regression model because we would be fitting the GP on the rates of death not the counts. The training dataset is called inla_gp_data. While the GP model will fit and predict a rate not a count, we still want to compare its predictions with respect to counts, so the test dataset is the same as before (inla_test_data). Once the predictions of rate of death per 100k are obtained from the GPR model, we will convert them back into counts using the cluster populations.

cluster_mortality_total_red = cluster_mortality_total[13:72,]
cluster_mortality_rate_red = cluster_mortality_rate[13:72,]

response = t(cluster_mortality_total_red)
response = as.vector(response)
response = ceiling(response)

response2 = t(cluster_mortality_rate_red)
response2 = as.vector(response2)

id = rep(c(1:7),60)
id2 = 1:(7*60)
time = rep(c(1:60),each = 7)

inla_full_data = data.frame(id,id2,response,time)

inla_full_data2 = data.frame(id,id2,response2,time)

months = rep(c(1:12),each = 7)
months = rep(months,5)
inla_full_data = cbind(inla_full_data,months)

#Experimented with defining each of these as factors
# inla_full_data$id = factor(inla_full_data$id) 
# inla_full_data$id2 = factor(inla_full_data$id2)
# inla_full_data$time = factor(inla_full_data$time)
inla_full_data$months = factor(inla_full_data$months)

#Add multiple intercept columns, one for each cluster 
Intercept1 = rep(c(1,NA,NA,NA,NA,NA,NA),60)
Intercept2 = rep(c(NA,1,NA,NA,NA,NA,NA),60)
Intercept3 = rep(c(NA,NA,1,NA,NA,NA,NA),60)
Intercept4 = rep(c(NA,NA,NA,1,NA,NA,NA),60)
Intercept5 = rep(c(NA,NA,NA,NA,1,NA,NA),60)
Intercept6 = rep(c(NA,NA,NA,NA,NA,1,NA),60)
Intercept7 = rep(c(NA,NA,NA,NA,NA,NA,1),60)

inla_full_data = cbind(inla_full_data,Intercept1,Intercept2,Intercept3,Intercept4,
          Intercept5,Intercept6,Intercept7)

# response = replace_na(response,0.01)
inla_gp_data = inla_full_data2

year = rep(2015:2019,each = 12)
inla_gp_data = cbind(inla_gp_data,year)

###Split into in sample and out of sample dataset
inla_outsample_data = inla_full_data

#Omit values for months 61-66 (out of sample dataset)
omit_idx = which(inla_outsample_data$time > 54)
inla_outsample_data$response[omit_idx] = NA
inla_insample_data = inla_full_data[-omit_idx,]

omit_idx = which(inla_gp_data$time > 54)
inla_gp_data$response2[omit_idx] = NA

IN SAMPLE FITTING ANALYSIS

Fit a simple Poisson GLMM for our mortality data (Reference model 1)

We wanted to compare the performance of our proposed model with a few reference models. The first one is a Poisson generalized linear mixed model. This model assumes the observed data follows a Poisson distribution and the hyperparameter \(\lambda_i\) can be modeled using a mixed effects model with a log link.

In other words,

\(Y_{i,t} \sim Pois(\lambda_{i,t})\) for \(i=1,...,7\) and \(t=1,...,54\) where \(log(\lambda_{i,t}) = \beta_0 + \beta_1 * I \{t=2,14,...,50 \} + ... + \beta_{11} * I \{t=12,24,...,54 \} + u_i\)

where the random effect \(u_i \sim MVN(0,\tau \Sigma)\)

We wanted the first reference model to be simple, so we assumed that the random effects \(u_i\) are iid. This means that \(\Sigma\) is simply a diagonal matrix of scaling factors. The hyperparameter \(log(\tau)\) is by default assigned a \(log \; \Gamma(1,0.00005)\) prior.

#Write a function to fit our poisson glmm in INLA 
ref_model1 = function(dataset,a_prior = 1,b_prior = 5e-05,link=1){
  ###Fit INLA model 
  prec_prior <- list(prec = list(prior = "loggamma", param = c(a_prior,b_prior)))
  ref_formula1 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
    Intercept5 + Intercept6 + Intercept7 + f(id,model = "iid", hyper = prec_prior) #could use id or id2 
  model = inla(formula = ref_formula1,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  ref_model1_results = list(
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model
  )
  
  return(ref_model1_results)
}

#Run model 
ref_model1_fit = ref_model1(dataset = inla_insample_data,a_prior = 1,b_prior = 1e-5)

#Extract DIC and WAIC
ref_model1_DIC = ref_model1_fit$model_DIC
ref_model1_WAIC = ref_model1_fit$model_WAIC

#Get summaries of parameter estimates
ref_model1_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.1040003 7.254715 -12.123616 2.1040003   16.33162 2.1040003
## months2    1.8618778 7.254716 -12.365741 1.8618778   16.08950 1.8618778
## months3    1.8812186 7.254716 -12.346400 1.8812186   16.10884 1.8812186
## months4    1.6930269 7.254717 -12.534594 1.6930269   15.92065 1.6930269
## months5    1.6124060 7.254718 -12.615217 1.6124060   15.84003 1.6124060
## months6    1.4969699 7.254719 -12.730655 1.4969699   15.72459 1.4969699
## months7    1.4681960 7.254722 -12.759434 1.4681960   15.69583 1.4681960
## months8    1.4344608 7.254722 -12.793170 1.4344608   15.66209 1.4344608
## months9    1.4008388 7.254723 -12.826793 1.4008388   15.62847 1.4008388
## months10   1.4724090 7.254722 -12.755221 1.4724090   15.70004 1.4724090
## months11   1.5162774 7.254721 -12.711351 1.5162774   15.74391 1.5162774
## months12   1.7814823 7.254719 -12.446141 1.7814823   16.00911 1.7814823
## Intercept1 2.1524562 7.254737 -12.075203 2.1524562   16.38012 2.1524562
## Intercept2 0.6814394 7.254819 -13.546382 0.6814394   14.90926 0.6814394
## Intercept3 2.0751109 7.254739 -12.152552 2.0751109   16.30277 2.0751109
## Intercept4 4.2895290 7.254715  -9.938087 4.2895290   18.51715 4.2895290
## Intercept5 3.6523905 7.254717 -10.575231 3.6523905   17.88001 3.6523905
## Intercept6 1.8619689 7.254745 -12.365706 1.8619689   16.08964 1.8619689
## Intercept7 5.0102692 7.254713  -9.217344 5.0102692   19.23788 5.0102692
##                     kld
## months1    5.527829e-11
## months2    5.527826e-11
## months3    5.527827e-11
## months4    5.527825e-11
## months5    5.527837e-11
## months6    5.527835e-11
## months7    5.527845e-11
## months8    5.527831e-11
## months9    5.527844e-11
## months10   5.527832e-11
## months11   5.527846e-11
## months12   5.527850e-11
## Intercept1 5.527837e-11
## Intercept2 5.527832e-11
## Intercept3 5.527832e-11
## Intercept4 5.527842e-11
## Intercept5 5.527837e-11
## Intercept6 5.527837e-11
## Intercept7 5.527841e-11
ref_model1_fit$bri_hyperpar_summary
##                  mean          sd      q0.025        q0.5     q0.975
## SD for id 0.005177222 0.004471684 0.001635036 0.003776738 0.01824029
##                  mode
## SD for id 0.002548752
ref_model1_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.198903   6.435810   6.561496   5.435910   5.014863   4.468129   4.341396 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.197381   4.058603   4.359725   4.555236   5.938653   8.605970   1.976721 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.965429  72.932108  38.566749   6.436397 149.945090
#Show plots
ref_model1_fit$param_plot

ref_model1_fit$hyperparam_plot

Note: SD for ID: standard deviation for the means (avg intensities) corresponding to the 8 different clusters was ???

Plot of posterior predictive estimates with credible interval bands OVERLAID on response:

#Write a function to make plot of posterior predictive estimates with credible interval bands OVERLAID on response
pp_insample_plot = function(num_plots = num_clus, ref_data = inla_insample_data, pred_data){
  for (i in 1:num_plots){
  df = ref_data %>% filter(id == i) %>% select(response)
  preds = pred_data %>% filter(id == i) 
  df = cbind(df,preds)
  
  title = sprintf("Posterior Predictive Fits for Cluster %s",i)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_ribbon(aes(ymin = `0.025quant`,ymax = `0.975quant`),alpha = 0.3) + ggtitle(title)
  print(post_pred_plot)
  }
}
#Plot ref_model1 pp plot  
pp_insample_plot(pred_data = ref_model1_fit$fitted_values)

Fitting a Besag-York-Mollie model (Reference model 2)

For our second reference model, we decided to fit a Besag-York-Mollie model, which is a log-normal Poisson model with an intrinsic conditional autoregressive component to capture spatial autocorrelations i.e. a Besag model, plus a standard random effects term which is included to capture non-spatial heterogeneity. Obviously, this model is less naive than reference model 1 because it does not assume iid random effects.

The BYM model can be written as,

\(Y_{i,t} \sim Pois(\lambda_{i,t})\) for \(i=1,...,7\) and \(t=1,...,54\) where \(log(\lambda_{i,t}) = \beta_0 + \beta_1 * I \{t=2,14,...,50 \} + ... + \beta_{11} * I \{t=12,24,...,54 \} + \phi + u_i\)

where \(p(\phi) \propto exp(-\frac{1}{2} \sum_{i \sim j} (\phi_i - \phi_j)^2)\) and \(u_i \sim MVN(0,\tau \Sigma)\).

Note: it is more commonly known that ICAR components are conditionally normally distributed.

As one can see below, the summary outputs indicate that this model is very similar to the Poisson GLMM (reference model 1). The intercept and SD for the random effect component are estimated to almost the exact same as those estimated by the Poisson GLMM, indicating that including the spatial ICAR component is seemingly not very impactful.

#Write a function to fit our BYM model in INLA 
ref_model2 = function(dataset,a_prec_prior = 1,b_prec_prior = 5e-04,a_phi_prior = 1,b_phi_prior = 5e-04,link=1){
  ###Fit INLA model 
  bym_prior <- list(
  prec.unstruct = list(
    prior = "loggamma",
    param = c(a_prec_prior,b_prec_prior)),
  prec.spatial = list(
    prior = "loggamma",
    param = c(a_phi_prior,b_phi_prior))
  )
  ref_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
    Intercept5 + Intercept6 + Intercept7 + 
    f(id, model = "bym", graph = huge.est, hyper = bym_prior) #ID2 in formula results in error 
  model = inla(formula = ref_formula2,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  ref_model2_results = list(
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model
  )
  
  return(ref_model2_results)
}

#Fit ref_model2
ref_model2_fit = ref_model2(dataset = inla_insample_data,a_prec_prior = 1,b_prec_prior = 1e-5,
                            a_phi_prior = 2,b_phi_prior = 0.5005)

#Extract DIC and WAIC
ref_model2_DIC = ref_model2_fit$model_DIC
ref_model2_WAIC = ref_model2_fit$model_WAIC

#Get summaries of parameter estimates
ref_model2_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.1040002 7.254715 -12.123616 2.1040002   16.33162 2.1040002
## months2    1.8618776 7.254716 -12.365741 1.8618776   16.08950 1.8618776
## months3    1.8812185 7.254716 -12.346400 1.8812185   16.10884 1.8812185
## months4    1.6930268 7.254717 -12.534594 1.6930268   15.92065 1.6930268
## months5    1.6124059 7.254718 -12.615217 1.6124059   15.84003 1.6124059
## months6    1.4969697 7.254719 -12.730655 1.4969697   15.72459 1.4969697
## months7    1.4681958 7.254722 -12.759434 1.4681958   15.69583 1.4681958
## months8    1.4344607 7.254722 -12.793170 1.4344607   15.66209 1.4344607
## months9    1.4008387 7.254723 -12.826793 1.4008387   15.62847 1.4008387
## months10   1.4724089 7.254722 -12.755221 1.4724089   15.70004 1.4724089
## months11   1.5162773 7.254721 -12.711352 1.5162773   15.74391 1.5162773
## months12   1.7814822 7.254719 -12.446141 1.7814822   16.00911 1.7814822
## Intercept1 2.1525450 7.259066 -12.083608 2.1525448   16.38870 2.1525446
## Intercept2 0.6814789 7.260532 -13.557552 0.6814784   14.92051 0.6814775
## Intercept3 2.0752552 7.260396 -12.163509 2.0752549   16.31402 2.0752545
## Intercept4 4.2894110 7.260285  -9.949138 4.2894114   18.52796 4.2894120
## Intercept5 3.6522438 7.262452 -10.590563 3.6522443   17.89505 3.6522453
## Intercept6 1.8620586 7.259134 -12.374229 1.8620585   16.09835 1.8620582
## Intercept7 5.0101701 7.259045  -9.225943 5.0101703   19.24628 5.0101708
##                     kld
## months1    5.527829e-11
## months2    5.527840e-11
## months3    5.527827e-11
## months4    5.527838e-11
## months5    5.527837e-11
## months6    5.527836e-11
## months7    5.527832e-11
## months8    5.527844e-11
## months9    5.527844e-11
## months10   5.527832e-11
## months11   5.527832e-11
## months12   5.527850e-11
## Intercept1 5.512345e-11
## Intercept2 5.500927e-11
## Intercept3 5.501457e-11
## Intercept4 5.502261e-11
## Intercept5 5.478661e-11
## Intercept6 5.511931e-11
## Intercept7 5.512353e-11
ref_model2_fit$bri_hyperpar_summary
##                                      mean          sd      q0.025        q0.5
## SD for id (idd component)     0.004715938 0.002874205 0.001728585 0.003884146
## SD for id (spatial component) 0.609526483 0.237151581 0.303610000 0.555125878
##                                  q0.975        mode
## SD for id (idd component)     0.0124817 0.002813492
## SD for id (spatial component) 1.2168272 0.463517818
ref_model2_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.198902   6.435810   6.561495   5.435909   5.014862   4.468129   4.341396 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.197381   4.058603   4.359725   4.555236   5.938652   8.606734   1.976799 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.966579  72.923507  38.561092   6.436974 149.930235
#Show plots
ref_model2_fit$param_plot

ref_model2_fit$hyperparam_plot

pp_insample_plot(pred_data = ref_model2_fit$fitted_values)

Fitting kernel graph regression models

KGR model with time series kernel x graph filter (Proposed model 2)

Finally, we fit our proposed model which we call a kernel graph regression model. It also takes the form of a latent Gaussian model as shown below:

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{i,t} | \textbf{X} = exp(\beta_0 + F_{i,t})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,K \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(x_{t_1},x_{t_2})(H^2)_{n_1,n_2}\).

The key difference here is that the covariance matrix of this GP is specified by the kronecker product of \(K\), which is the time kernel gram matrix calculated from the EPA air quality data, and \(H\), which is the graph filter which is calculated from the adjacency matrix estimated by glasso using the HUGE package. This matrix is completely known and can be directly plugged into INLA as the covariance matrix of our underlying GP using the “generic0” specification as shown below:

Calculating gram matrix K from EPA data

Using the EPA air quality data, we can calculate the gram matrix K which will characterize the dependence structure of air quality (across 7 different pollutants and AQI) over time. This is done by calculating the squared difference between all of the observations at two different time points e.g. 64 observations for Jan 2014 compared with the 64 observations for Feb 2014. For the in sample analysis, the resulting matrix is 66x66 because we are holding out the last 6 months of observations.

EPA_kernel = function(EPA_data = final_EPA_agg_data,time_span,rho_rbf,rho_periodic,sigma2){
  K_EPA = matrix(0,nrow=time_span,ncol=time_span)
  i = 1
  j = 1
  
  for(t1 in 1:time_span){
    for (t2 in 1:time_span){
      A = EPA_data %>% filter(Time == t1)
      B = EPA_data %>% filter(Time == t2)
      AQIa = unique(A$AQI)
      AQIb = unique(B$AQI)
      
      ABtest = c((A$Value-B$Value)^2,(AQIa-AQIb)^2) #7 clusters * 8 measurements 
      # K_EPA[i,j] = exp(-sum(ABtest) / (2*rho_rbf)) * sigma2
      
      K_EPA[i,j] = exp(- (sum(ABtest)^2)
                     / (2*rho_rbf)) * exp(- (2*sin(sum(abs(ABtest))*pi/12)^2)
                     / (rho_periodic)) * sigma2
      
      j = j+1
    }
    
    j = 1
    i = i+1
  }
  
  return(K_EPA)
}

Ensuring precision matrix is not computationally singular, so we jitter eigenvalues, using reciprocal condition number as constraint

desingularize = function(covmatrix,threshold = 1e-2,increment = 0.01){
  
  tracker = 0
  
  while (rcond(covmatrix) <= threshold){
    #Perform spectral decomposition
    ev = eigen(covmatrix)
    L = ev$values
    V = ev$vectors
    
    # #Add a little noise to eigenvalues to bring away from 0
    L = L + increment
    
    # #Add 0.01 to eigenvalues in bottom five percentile to bring away from 0
    # cutoff = quantile(abs(L),0.05)
    # L[which(abs(L) < cutoff)] = L[which(abs(L) < cutoff)] + 0.01
    
    #Calculate new precision matrix 
    covmatrix = V %*% diag(L) %*% t(V)
    
    tracker = tracker + 1
  }
  
  results_list = list(covmatrix,tracker)
  #sprintf("%s iterations of desingularizer applied",tracker)
  return(results_list)
}

# test = desingularize(K_time)

GLMM with type 0 generic specification (known covariance matrix)

kgr_model2 = function(dataset, rho_EPA_rbf = 1, rho_EPA_periodic = 1, sigma2_EPA = 1,link=1){
  
  #Calculate gram matrix K_EPA
  K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
                     rho_rbf = rho_EPA_rbf, rho_periodic = rho_EPA_periodic, sigma2 = sigma2_EPA)
  
  #Heatmap of resulting K 
  K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA Heatmap")
  
  #Calculate trace norm of gram matrix
  K_EPA_weight = norm((1/60)*K_EPA,type = "F")
  
  ###Load graph regression kernel 
  covGP = kronecker(K_EPA/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP,threshold = 1e-2,increment = 0.01)
  covGP = covGP_jittered[[1]]
  inv_covGP = solve(covGP)
  # cov_Fnorm = norm(covGP,type = "F")
  
  #Heatmap of resulting K 
  inv_covGP_heatmap = matrix_heatmap(inv_covGP,title = "Precision matrix heatmap")
  
  ###Fit INLA model 
  # kgr_formula2 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
  #   Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP)
  
  kgr_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
    Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP)
  
  
  model = inla(formula = kgr_formula2,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values

  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model2_results = list(
    K_EPA_heatmap = K_EPA_heatmap,
    K_EPA_weight = K_EPA_weight/(K_EPA_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_EPA_weight + gfilter_weight),
    covmatrix = covGP,
    prec = inv_covGP,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model2_results)
}

#Fit kgr_model2
kgr_model2_fit = kgr_model2(dataset = inla_insample_data,rho_EPA_rbf = 2429.591,rho_EPA_periodic = 1612.206, sigma2_EPA = 4.918)

#Extract DIC and WAIC 
kgr_model2_DIC = kgr_model2_fit$model_DIC
kgr_model2_WAIC = kgr_model2_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model2_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.1036794 7.254733  -12.12397 2.1036794   16.33133 2.1036794
## months2    1.8633345 7.254733  -12.36432 1.8633345   16.09099 1.8633345
## months3    1.8865785 7.254734  -12.34108 1.8865785   16.11423 1.8865785
## months4    1.6946797 7.254736  -12.53298 1.6946797   15.92234 1.6946797
## months5    1.6150746 7.254738  -12.61259 1.6150746   15.84274 1.6150746
## months6    1.4936942 7.254738  -12.73397 1.4936942   15.72136 1.4936942
## months7    1.4569220 7.254746  -12.77075 1.4569220   15.68460 1.4569220
## months8    1.4344156 7.254746  -12.79326 1.4344156   15.66209 1.4344156
## months9    1.3942400 7.254747  -12.83344 1.3942400   15.62192 1.3942400
## months10   1.4709639 7.254746  -12.75671 1.4709639   15.69864 1.4709639
## months11   1.5075661 7.254746  -12.72011 1.5075661   15.73524 1.5075661
## months12   1.7800556 7.254742  -12.44761 1.7800556   16.00773 1.7800556
## Intercept1 2.1515512 7.254754  -12.07614 2.1515512   16.37925 2.1515512
## Intercept2 0.6702461 7.254866  -13.55767 0.6702461   14.89816 0.6702461
## Intercept3 2.0671562 7.254771  -12.16057 2.0671562   16.29488 2.0671562
## Intercept4 4.2884079 7.254741   -9.93926 4.2884079   18.51608 4.2884079
## Intercept5 3.6610231 7.254751  -10.56666 3.6610231   17.88871 3.6610231
## Intercept6 1.8592251 7.254757  -12.36847 1.8592251   16.08692 1.8592251
## Intercept7 5.0035943 7.254724   -9.22404 5.0035943   19.23123 5.0035943
##                     kld
## months1    5.527841e-11
## months2    5.527842e-11
## months3    5.527840e-11
## months4    5.527838e-11
## months5    5.527847e-11
## months6    5.527833e-11
## months7    5.527836e-11
## months8    5.527836e-11
## months9    5.527834e-11
## months10   5.527849e-11
## months11   5.527822e-11
## months12   5.527841e-11
## Intercept1 5.527836e-11
## Intercept2 5.527828e-11
## Intercept3 5.527837e-11
## Intercept4 5.527826e-11
## Intercept5 5.527844e-11
## Intercept6 5.527832e-11
## Intercept7 5.527838e-11
kgr_model2_fit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.4337749 0.03419719 0.3698794 0.4325623 0.5041908 0.4302724
kgr_model2_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.196272   6.445192   6.596759   5.444902   5.028263   4.453517   4.292726 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.197191   4.031909   4.353429   4.515726   5.930186   8.598186   1.954718 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.902318  72.850392  38.901123   6.418761 148.947561
kgr_model2_fit$K_EPA_weight
## [1] 0.9117156
kgr_model2_fit$gfilter_weight
## [1] 0.08828444
kgr_model2_fit$num_jitters
## [1] 2
#Show plots
kgr_model2_fit$prec_heatmap

kgr_model2_fit$K_EPA_heatmap

kgr_model2_fit$param_plot

kgr_model2_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model2_fit$fitted_values)

We can also simplify the covariance of our underlying GP and see how our proposed model compares with a simplified version with a simple time kernel:

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{t} | \textbf{X} = exp(\beta_0 + F_{t})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,Ktime)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(t_1,t_2)\).

Instead of calculating gram matrix K based on covariate (EPA variables) similarity, our gram matrix K is simply a time kernel where similar values of t (months 1-72) have larger covariances. As a result, this model has no spatial dependence structure built in.

Calculating simple time kernel K

time_kernel = function(time_span,rho_rbf,rho_periodic,sigma2){
  K_time = matrix(NA,nrow = time_span, ncol = time_span)
  
  for (i in 1:time_span){
    for (j in 1:time_span){
      # K_time[i,j] = exp(- (abs(i-j)^2) / (2*rho)) * sigma2
      
      K_time[i,j] = exp(- (abs(i-j)^2) / (2*rho_rbf)) * exp(- (2*sin(sum(abs(i-j))*pi/12)^2)
                     / (rho_periodic)) * sigma2
    }
  }
  
  return(K_time)
}

LGCP with temporal kernel (Reference model 3)

Since there is no spatial component in this model, each cluster can be fit separately.

ref_model3 = function(dataset, cluster, rho_time_rbf = 1, rho_time_periodic = 1, sigma2_time = 1,link=1){
  
  #Calculating gram matrix K_time
  K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                       rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  #Calculate trace norm of gram matrix
  K_time_weight = norm(K_time,type = "F")
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(K_time,threshold = 1e-2,increment = 0.01)
  K_time = covGP_jittered[[1]]
  inv_K_time = solve(K_time)
  # cov_Fnorm = norm(K_time,type = "F")

  
  #Heatmap of resulting inv_K_time 
  inv_K_time_heatmap = matrix_heatmap(inv_K_time,title = "Precision matrix heatmap")
  
  
  ###Fitting the model on each cluster 
  inla_test_clus_data = dataset %>% filter(id == cluster)
  
  ref_formula3 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
    Intercept5 + Intercept6 + Intercept7 + f(time,model = "generic0",Cmatrix = inv_K_time)
  
  model = inla(ref_formula3, data = inla_test_clus_data, family = "poisson",
                    control.compute = list(dic=TRUE,waic=TRUE),
                    control.inla = list(strategy = "laplace"),
                    control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(inla_test_clus_data$id, inla_test_clus_data$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  ref_model3_results = list(
    K_time_heatmap = K_time_heatmap,
    K_time_weight = K_time_weight/(K_time_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_time_weight + gfilter_weight),
    covmatrix = K_time,
    prec = inv_K_time,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_K_time_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model
  )
  
  return(ref_model3_results)
}

#Fit ref_model3 on one cluster (to test)
ref_model3_fit = ref_model3(dataset = inla_insample_data, cluster = 2, rho_time_rbf = 1,
                   rho_time_periodic = 1, sigma2_time = 5)

#Extract DIC and WAIC
ref_model3_DIC = ref_model3_fit$model_DIC
ref_model3_WAIC = ref_model3_fit$model_WAIC

#Get summaries of parameter estimates
ref_model3_fit$model_summary
##                    mean          sd 0.025quant      0.5quant 0.975quant
## Intercept1 6.401948e-15 31.62254149 -62.016688 -9.016828e-15  62.016688
## Intercept2 2.363644e+00  0.04200925   2.281254  2.363644e+00   2.446034
## Intercept3 6.401948e-15 31.62254149 -62.016688 -9.016828e-15  62.016688
## Intercept4 6.401948e-15 31.62254149 -62.016688 -9.016828e-15  62.016688
## Intercept5 6.401948e-15 31.62254149 -62.016688 -9.016828e-15  62.016688
## Intercept6 6.401948e-15 31.62254149 -62.016688 -9.016828e-15  62.016688
## Intercept7 6.401948e-15 31.62254149 -62.016688 -9.016828e-15  62.016688
##                    mode          kld
## Intercept1 5.030787e-23 5.527836e-11
## Intercept2 2.363645e+00 3.296471e-11
## Intercept3 0.000000e+00 5.527836e-11
## Intercept4 0.000000e+00 5.527836e-11
## Intercept5 0.000000e+00 5.527836e-11
## Intercept6 0.000000e+00 5.527836e-11
## Intercept7 0.000000e+00 5.527836e-11
ref_model3_fit$bri_hyperpar_summary
##                    mean         sd      q0.025       q0.5     q0.975
## SD for time 0.009940187 0.00637071 0.003620911 0.00802757 0.02789661
##                    mode
## SD for time 0.005716609
ref_model3_fit$exp_effects
## Intercept1 Intercept2 Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##    1.00000   10.62962    1.00000    1.00000    1.00000    1.00000    1.00000
ref_model3_fit$K_time_weight
## [1] 0.9949444
ref_model3_fit$gfilter_weight
## [1] 0.005055571
ref_model3_fit$num_jitters
## [1] 0
#Show plots
ref_model3_fit$prec_heatmap

ref_model3_fit$K_time_heatmap

ref_model3_fit$param_plot

ref_model3_fit$hyperparam_plot

test1 = ref_model3(dataset = inla_insample_data, cluster = 1, rho_time_rbf = 214.194,
                   rho_time_periodic = 4.192, sigma2_time = 2.166)
test2 = ref_model3(dataset = inla_insample_data, cluster = 2, rho_time_rbf = 301.021,
                   rho_time_periodic = 1.765, sigma2_time = 1.148)
test3 = ref_model3(dataset = inla_insample_data, cluster = 3, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748)
test4 = ref_model3(dataset = inla_insample_data, cluster = 4, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748) #why does sigma2 have to be so big here??
test5 = ref_model3(dataset = inla_insample_data, cluster = 5, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748)
test6 = ref_model3(dataset = inla_insample_data, cluster = 6, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748) #why does sigma2 have to be so big here??
test7 = ref_model3(dataset = inla_insample_data, cluster = 7, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748)

ref_model3_fvs = rbind(test1$fitted_values,test2$fitted_values,test3$fitted_values,
                       test4$fitted_values,test5$fitted_values,test6$fitted_values,
                       test7$fitted_values)

pp_insample_plot(num_plots = num_clus,ref_data = inla_insample_data,pred_data = ref_model3_fvs)

KGR model with temporal kernel x graph filter (Proposed model 1)

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{i,t} | \textbf{X} = exp(\beta_0 + F_{i,t})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,Ktime \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(t_1,t_2)(H^2)_{n_1,n_2}\).

kgr_model1 = function(dataset, rho_time_rbf = 1, rho_time_periodic = 1, sigma2_time = 1, link=1){
  
  #Calculating gram matrix K_time
   K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                        rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  #Calculate trace norm of gram matrix
  K_time_weight = norm((1/60)*K_time,type = "F")
  
  #Calculate proposed kernel
  covGP2 = kronecker(K_time/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP2,threshold = 1e-2,increment = 0.01)
  covGP2 = covGP_jittered[[1]]
  
  inv_covGP2 = solve(covGP2)
  
  #Heatmap of resulting inv_covGP2 
  inv_covGP2_heatmap = matrix_heatmap(inv_covGP2,title = "Precision matrix heatmap")
  
  ###Fit INLA model 
  # kgr_formula1 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
  #   Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP2)
  
  kgr_formula1 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + 
    Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP2)
  
  model = inla(formula = kgr_formula1,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model1_results = list(
    K_time_heatmap = K_time_heatmap,
    K_time_weight = K_time_weight/(K_time_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_time_weight + gfilter_weight),
    covmatrix = covGP2,
    prec = inv_covGP2,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP2_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model1_results)
}

#Fit kgr_model1
kgr_model1_fit = kgr_model1(dataset = inla_insample_data,rho_time_rbf = 557.614,rho_time_periodic = 545.753,sigma2_time = 4.581)

#Extract DIC and WAIC
kgr_model1_DIC = kgr_model1_fit$model_DIC
kgr_model1_WAIC = kgr_model1_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model1_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0952719 7.254730 -12.132375 2.0952719   16.32292 2.0952719
## months2    1.8666137 7.254732 -12.361037 1.8666137   16.09426 1.8666137
## months3    1.8883025 7.254732 -12.339348 1.8883025   16.11595 1.8883025
## months4    1.6937793 7.254734 -12.533874 1.6937793   15.92143 1.6937793
## months5    1.6137948 7.254735 -12.613861 1.6137948   15.84145 1.6137948
## months6    1.4903349 7.254736 -12.737323 1.4903349   15.71799 1.4903349
## months7    1.4489731 7.254743 -12.778699 1.4489731   15.67665 1.4489731
## months8    1.4272112 7.254744 -12.800461 1.4272112   15.65488 1.4272112
## months9    1.3862836 7.254744 -12.841390 1.3862836   15.61396 1.3862836
## months10   1.4672182 7.254743 -12.760453 1.4672182   15.69489 1.4672182
## months11   1.5104853 7.254742 -12.717185 1.5104853   15.73816 1.5104853
## months12   1.7703115 7.254739 -12.457352 1.7703115   15.99797 1.7703115
## Intercept1 2.1479165 7.254801 -12.079869 2.1479165   16.37570 2.1479165
## Intercept2 0.6788217 7.254995 -13.549344 0.6788217   14.90699 0.6788217
## Intercept3 2.0616424 7.254865 -12.166268 2.0616424   16.28955 2.0616424
## Intercept4 4.2663208 7.254826  -9.961514 4.2663208   18.49416 4.2663208
## Intercept5 3.6420912 7.254863 -10.585815 3.6420912   17.87000 3.6420912
## Intercept6 1.8580507 7.254784 -12.369702 1.8580507   16.08580 1.8580507
## Intercept7 5.0037366 7.254751  -9.223950 5.0037366   19.23142 5.0037366
##                     kld
## months1    5.527831e-11
## months2    5.527843e-11
## months3    5.527829e-11
## months4    5.527827e-11
## months5    5.527838e-11
## months6    5.527837e-11
## months7    5.527826e-11
## months8    5.527826e-11
## months9    5.527852e-11
## months10   5.527826e-11
## months11   5.527841e-11
## months12   5.527847e-11
## Intercept1 5.527846e-11
## Intercept2 5.527847e-11
## Intercept3 5.527843e-11
## Intercept4 5.527848e-11
## Intercept5 5.527845e-11
## Intercept6 5.527844e-11
## Intercept7 5.527845e-11
kgr_model1_fit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.6306707 0.04938199 0.5382832 0.6289606 0.7322413 0.6257159
kgr_model1_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.127651   6.466362   6.608142   5.440002   5.021832   4.438582   4.258739 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.167062   3.999957   4.337153   4.528928   5.872682   8.566990   1.971553 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.858867  71.258979  38.171579   6.411227 148.968759
kgr_model1_fit$K_time_weight
## [1] 0.9308613
kgr_model1_fit$gfilter_weight
## [1] 0.06913872
kgr_model1_fit$num_jitters
## [1] 1
#Show plots
kgr_model1_fit$K_time_heatmap

kgr_model1_fit$prec_heatmap

kgr_model1_fit$param_plot

kgr_model1_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model1_fit$fitted_values)

Finally, we could also increase the complexity of our proposed model by including our time kernel in the covariance structure of the underlying GP. Notice that K does not explicitly have a temporal dependence structure; instead, it represents EPA covariate similarity compared across months. We can explicitly include the time kernel above by either element wise adding or multiplying K_EPA and K_time together before taking the kronecker product with \(H^2\)

Proposed model 3

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{it} | \textbf{X} = exp(\beta_0 + F_{it})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,(Ktime \odot K_{EPA}) \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(t_1,t_2)k(x_{t_1},x_{t_2})(H^2)_{n_1,n_2}\).

kgr_model3 = function(dataset,rho_EPA_rbf = 1,rho_EPA_periodic = 1,rho_time_rbf = 1,rho_time_periodic = 1,sigma2 = 1,link=1){
  
  ###Calculating gram matrix K_EPA
  K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
                     rho_rbf = rho_EPA_rbf,rho_periodic = rho_EPA_periodic,sigma2 = 1)
  
  #Heatmap of resulting K 
  K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA heatmap")
  
  ###Calculating gram matrix K_time
 K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                      rho_periodic = rho_time_periodic, sigma2 = sigma2)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  K_EPA_norm = norm(K_EPA,type = "F")
  K_time_norm = norm(K_time,type = "F")
  
  #Calculate trace norm of gram matrix
  gram = (K_EPA*K_time)/sigma2
  K_weight = norm((1/60)*gram,type = "F")
  
  
  ###Load graph regression kernel 
  # covGP3 = kronecker(gram,H^2)
  covGP3 = kronecker(gram/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP3,threshold = 1e-2,increment = 0.01)
  covGP3 = covGP_jittered[[1]]
  
  inv_covGP3 = solve(covGP3)
  
  #Heatmap of resulting K 
  inv_covGP3_heatmap = matrix_heatmap(inv_covGP3,title = "Precision matrix heatmap")
  
  ###Fit INLA model 
  # kgr_formula3 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
  #   Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP3)
  
  kgr_formula3 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
    Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP3)
  
  model = inla(formula = kgr_formula3,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "gaussian"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model3_results = list(
    K_EPA_heatmap = K_EPA_heatmap,
    K_time_heatmap = K_time_heatmap,
    K_EPA_weight = K_EPA_norm / (K_EPA_norm + K_time_norm),
    K_time_weight = K_time_norm / (K_EPA_norm + K_time_norm),
    K_weight = K_weight/(K_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
    covmatrix = covGP3,
    prec = inv_covGP3,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP3_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model3_results)
}

#Fit kgr_model3
kgr_model3_fit = kgr_model3(dataset = inla_insample_data, rho_EPA_rbf = 2871.019, rho_EPA_periodic = 2269.161,
                            rho_time_rbf = 3079.136, rho_time_periodic = 1060.033, sigma2 = 1.115)

#Extract DIC and WAIC
kgr_model3_DIC = kgr_model3_fit$model_DIC
kgr_model3_WAIC = kgr_model3_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model3_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0994408 7.254731 -12.128207 2.0994408   16.32709 2.0994408
## months2    1.8659602 7.254731 -12.361688 1.8659602   16.09361 1.8659602
## months3    1.8895147 7.254732 -12.338135 1.8895147   16.11716 1.8895147
## months4    1.6972207 7.254734 -12.530433 1.6972207   15.92487 1.6972207
## months5    1.6172179 7.254735 -12.610438 1.6172179   15.84487 1.6172179
## months6    1.4940561 7.254736 -12.733602 1.4940561   15.72171 1.4940561
## months7    1.4571997 7.254743 -12.770472 1.4571997   15.68487 1.4571997
## months8    1.4349623 7.254743 -12.792710 1.4349623   15.66263 1.4349623
## months9    1.3943579 7.254744 -12.833316 1.3943579   15.62203 1.3943579
## months10   1.4719710 7.254743 -12.755700 1.4719710   15.69964 1.4719710
## months11   1.5095814 7.254743 -12.718090 1.5095814   15.73725 1.5095814
## months12   1.7764190 7.254739 -12.451245 1.7764190   16.00408 1.7764190
## Intercept1 2.1515823 7.254746 -12.076095 2.1515823   16.37926 2.1515823
## Intercept2 0.6767339 7.254842 -13.551131 0.6767339   14.90460 0.6767339
## Intercept3 2.0707874 7.254755 -12.156908 2.0707874   16.29848 2.0707874
## Intercept4 4.2865395 7.254730  -9.941106 4.2865395   18.51418 4.2865395
## Intercept5 3.6554541 7.254736 -10.572203 3.6554541   17.88311 3.6554541
## Intercept6 1.8599247 7.254752 -12.367764 1.8599247   16.08761 1.8599247
## Intercept7 5.0068798 7.254720  -9.220746 5.0068798   19.23451 5.0068798
##                     kld
## months1    5.527831e-11
## months2    5.527831e-11
## months3    5.527830e-11
## months4    5.527840e-11
## months5    5.527839e-11
## months6    5.527823e-11
## months7    5.527840e-11
## months8    5.527840e-11
## months9    5.527838e-11
## months10   5.527854e-11
## months11   5.527840e-11
## months12   5.527846e-11
## Intercept1 5.527835e-11
## Intercept2 5.527851e-11
## Intercept3 5.527848e-11
## Intercept4 5.527844e-11
## Intercept5 5.527824e-11
## Intercept6 5.527841e-11
## Intercept7 5.527835e-11
kgr_model3_fit$bri_hyperpar_summary
##                 mean         sd    q0.025     q0.5    q0.975      mode
## SD for id2 0.6341904 0.04961669 0.5414159 0.632454 0.7362939 0.6291667
kgr_model3_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.161605   6.462138   6.616157   5.458755   5.039052   4.455129   4.293918 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.199487   4.032385   4.357816   4.524836   5.908659   8.598453   1.967441 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.931066  72.714402  38.685086   6.423253 149.437738
kgr_model3_fit$K_EPA_weight
## [1] 0.3538621
kgr_model3_fit$K_time_weight
## [1] 0.6461379
kgr_model3_fit$K_weight
## [1] 0.6731252
kgr_model3_fit$gfilter_weight
## [1] 0.3268748
kgr_model3_fit$num_jitters
## [1] 1
#Show plots
kgr_model3_fit$K_time_heatmap

kgr_model3_fit$K_EPA_heatmap

kgr_model3_fit$prec_heatmap

kgr_model3_fit$param_plot

kgr_model3_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model3_fit$fitted_values)

Proposed model 4

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{it} | \textbf{X} = exp(\beta_0 + F_{it})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,\frac{1}{2}(Ktime + K_{EPA}) \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = (k(t_1,t_2)+k(x_{t_1},x_{t_2}))(H^2)_{n_1,n_2}\).

kgr_model4 = function(dataset,rho_EPA_rbf = 1, rho_EPA_periodic = 1,
                               rho_time_rbf = 1, rho_time_periodic = 1, sigma2_EPA = 1, sigma2_time = 1,link = 1){
  
  ###Calculating gram matrix K_EPA
  K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
                     rho_rbf = rho_EPA_rbf,rho_periodic = rho_EPA_periodic,sigma2 = 1)
  
  #Heatmap of resulting K 
  K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA heatmap")
  
  ###Calculating gram matrix K_time
  K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf, 
                    rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
  
  #Heatmap of resulting K 
  K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
  
  K_EPA_norm = norm(K_EPA,type = "F")
  K_time_norm = norm(K_time,type = "F")
  
  gram = 0.5*(K_time+K_EPA)
  K_weight = norm((1/60)*gram,type = "F")

  ###Load graph regression kernel 
  # covGP4 = kronecker(gram,(H^2))
  covGP4 = kronecker(gram/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP4,threshold = 1e-2,increment = 0.01)
  covGP4 = covGP_jittered[[1]]
  
  inv_covGP4 = solve(covGP4)
  
  #Heatmap of resulting K 
  inv_covGP4_heatmap = matrix_heatmap(inv_covGP4,title = "Precision matrix heatmap")
  
  
  ###Fit INLA model 
  # kgr_formula4 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
  #   Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP4)
  
  kgr_formula4 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
    Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP4)
  
  model = inla(formula = kgr_formula4,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model4_results = list(
    K_EPA_heatmap = K_EPA_heatmap,
    K_time_heatmap = K_time_heatmap,
    K_EPA_weight = K_EPA_norm / (K_EPA_norm + K_time_norm),
    K_time_weight = K_time_norm / (K_EPA_norm + K_time_norm),
    K_weight = K_weight/(K_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
    covmatrix = covGP4,
    prec = inv_covGP4,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP4_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model4_results)
}

#Fit kgr_model4
kgr_model4_fit = kgr_model4(dataset = inla_insample_data, rho_EPA_rbf = 446.638, rho_EPA_periodic = 413.201,
                            rho_time_rbf = 569.756, rho_time_periodic = 550.984, sigma2_EPA = 1.811, sigma2_time = 4.961)

#Extract DIC and WAIC 
kgr_model4_DIC = kgr_model4_fit$model_DIC
kgr_model4_WAIC = kgr_model4_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model4_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0957336 7.254730 -12.131912 2.0957336   16.32338 2.0957336
## months2    1.8665422 7.254732 -12.361107 1.8665422   16.09419 1.8665422
## months3    1.8881939 7.254732 -12.339457 1.8881939   16.11584 1.8881939
## months4    1.6936608 7.254734 -12.533993 1.6936608   15.92131 1.6936608
## months5    1.6149959 7.254735 -12.612660 1.6149959   15.84265 1.6149959
## months6    1.4918392 7.254736 -12.735819 1.4918392   15.71950 1.4918392
## months7    1.4522483 7.254743 -12.775424 1.4522483   15.67992 1.4522483
## months8    1.4305728 7.254743 -12.797099 1.4305728   15.65824 1.4305728
## months9    1.3893551 7.254745 -12.838320 1.3893551   15.61703 1.3893551
## months10   1.4696386 7.254743 -12.758033 1.4696386   15.69731 1.4696386
## months11   1.5129659 7.254743 -12.714705 1.5129659   15.74064 1.5129659
## months12   1.7719669 7.254739 -12.455696 1.7719669   15.99963 1.7719669
## Intercept1 2.1497282 7.254774 -12.078004 2.1497282   16.37746 2.1497282
## Intercept2 0.6792418 7.254919 -13.548775 0.6792418   14.90726 0.6792418
## Intercept3 2.0655784 7.254811 -12.162227 2.0655784   16.29338 2.0655784
## Intercept4 4.2727382 7.254780  -9.955006 4.2727383   18.50048 4.2727383
## Intercept5 3.6462192 7.254802 -10.581568 3.6462192   17.87401 3.6462192
## Intercept6 1.8590593 7.254768 -12.368661 1.8590593   16.08678 1.8590593
## Intercept7 5.0051480 7.254736  -9.222509 5.0051480   19.23280 5.0051480
##                     kld
## months1    5.527847e-11
## months2    5.527843e-11
## months3    5.527829e-11
## months4    5.527854e-11
## months5    5.527838e-11
## months6    5.527823e-11
## months7    5.527840e-11
## months8    5.527826e-11
## months9    5.527850e-11
## months10   5.527826e-11
## months11   5.527854e-11
## months12   5.527833e-11
## Intercept1 5.527820e-11
## Intercept2 5.527841e-11
## Intercept3 5.527844e-11
## Intercept4 5.527837e-11
## Intercept5 5.527829e-11
## Intercept6 5.527829e-11
## Intercept7 5.527838e-11
kgr_model4_fit$bri_hyperpar_summary
##                 mean         sd    q0.025     q0.5   q0.975      mode
## SD for id2 0.6258224 0.04871945 0.5346796 0.624133 0.726037 0.6209287
kgr_model4_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.131404   6.465900   6.607424   5.439357   5.027867   4.445264   4.272710 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.181093   4.012262   4.347664   4.540177   5.882412   8.582525   1.972382 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.889860  71.717748  38.329475   6.417697 149.179156
kgr_model4_fit$K_EPA_weight
## [1] 0.0806733
kgr_model4_fit$K_time_weight
## [1] 0.9193267
kgr_model4_fit$K_weight
## [1] 0.8846952
kgr_model4_fit$gfilter_weight
## [1] 0.1153048
kgr_model4_fit$num_jitters
## [1] 1
#Show plots
kgr_model4_fit$K_time_heatmap

kgr_model4_fit$K_EPA_heatmap

kgr_model4_fit$prec_heatmap

kgr_model4_fit$param_plot

kgr_model4_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model4_fit$fitted_values)

Proposed model 5

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{it} | \textbf{X} = exp(\beta_0 + F_{it})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,\frac{1}{3}(K_{AR} + K_{DL} + K_{Interaction}) \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = (k(t_1,t_2)+k(x_{t_1},x_{t_2}))(H^2)_{n_1,n_2}\).

kgr_model5 = function(dataset, rho_AR_rbf = 1, rho_AR_periodic = 1, rho_DL_rbf = 1, rho_DL_periodic = 1,
                      rho_int_rbf = 1, rho_int_periodic = 1, sigma2_AR = 1, sigma2_DL = 1, sigma2_int = 1, link=1){
  
  #Calculating gram matrix K_AR
  K_AR_cluster = list()
  K_AR_periodic_cluster = list()
  
  for (c in 1:num_clus){
    
    #Grab S_random data for cluster c
    cluster_data = decomposed_cluster_data[[c]]
    S_random_clus = cluster_data$S_random
    
    #Create a list to contain covariance matrix for each pollutant (8)
    K_AR_list = list()
    K_AR_periodic_list = list()
    
    time_span = nrow(S_random_clus)
    
    #Calculate a AR 1 covariance matrix for each pollutant and store in list
    for (i in 1:8){
      ts =  S_random_clus[,i]
    
      K_covariate = matrix(nrow=time_span,ncol=time_span)
      K_covariate_periodic = matrix(nrow=time_span,ncol=time_span)
      
      for(j in 1:time_span){
        for (k in 1:time_span){
          if (abs(j-k) <= 1){
            
            K_covariate[j,k] = exp(- ((ts[j] - ts[k])^2) #RBF kernel 
                                 / (2*rho_AR_rbf)) * sigma2_AR
            
            K_covariate_periodic[j,k] = exp(- ((ts[j] - ts[k])^2) #Locally periodic kernel 
                         / (2*rho_AR_rbf)) * exp(- (2*sin((abs(ts[j] - ts[k]))*pi/12)^2)
                         / (rho_AR_periodic)) * sigma2_AR
          }
          else{
            K_covariate_periodic[j,k] = 0
            K_covariate[j,k] = 0
            }
        }
      }
      
      K_AR_list[[i]] = K_covariate
      K_AR_periodic_list[[i]] = K_covariate_periodic
    }
    
    names(K_AR_list) = colnames(S_random_clus)
    names(K_AR_periodic_list) = colnames(S_random_clus)
    
    #Add each pollutant's covariance matrix to get AR 1 matrix for each cluster
    K_AR = matrix(0,nrow=60,ncol=60)
    K_AR_periodic = matrix(0,nrow=60,ncol=60)
    
    for(i in 1:length(K_AR_periodic_list)){
      K_AR = K_AR + ((1/8)*K_AR_list[[i]])
      K_AR_periodic = K_AR_periodic + ((1/8)*K_AR_periodic_list[[i]])
    }
    
    K_AR_cluster[[c]] = K_AR
    K_AR_periodic_cluster[[c]] = K_AR_periodic
  }
  
  K_AR = matrix(0,nrow=60,ncol=60)
  K_AR_periodic = matrix(0,nrow=60,ncol=60)
  
  for(i in 1:num_clus){
    K_AR = K_AR + ((1/num_clus)*K_AR_cluster[[i]])
    K_AR_periodic = K_AR_periodic + ((1/num_clus)*K_AR_periodic_cluster[[i]])
  }
  
  K_AR_norm = norm(K_AR_periodic,type = "F")
  
  #Heatmap of resulting K 
  # K_AR_heatmap = corrplot(K_AR, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  # title = "AR 1 Covariance Structure")
  # K_AR_heatmap = corrplot(K_AR_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic AR 1 Covariance Structure")
  
  # K_AR_heatmap = matrix_heatmap(K_AR,title = "AR 1 Covariance Structure")
  K_AR_heatmap = matrix_heatmap(K_AR_periodic,title = "Periodic AR 1 Covariance Structure")

  ###Calculating gram matrix K_DL
  K_DL_cluster = list()
  K_DL_periodic_cluster = list()
  
  for (c in 1:num_clus){
    
    #Grab S_DL data for cluster c
    cluster_data = decomposed_cluster_data[[c]]
    S_DL_clus = cluster_data$S_DL
    
    #Create a list to store covariance matrix for each DL 
    K_DL_list = list()
    K_DL_periodic_list = list()
    
    dl_lags = c(3,6,12)
    tracker = 1
    
    for (i in dl_lags){
      
      K_DL = matrix(nrow=time_span,ncol=time_span)
      K_DL_periodic = matrix(nrow=time_span,ncol=time_span)
      
      #Calculate DL covariance matrix for specified lag   
      for(j in 1:nrow(S_DL_clus)){
        for (k in 1:nrow(S_DL_clus)){
          
          if ((abs(j-k) == 0) || (abs(j-k) == i)){
            
            K_DL[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2) / (2*rho_DL_rbf)) * sigma2_DL
            
            K_DL_periodic[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2)
                                 / (2*rho_DL_rbf)) * exp(- (2*sin(sum(abs(S_DL_clus[j,] - S_DL_clus[k,]))*pi/12)^2)
                                 / (rho_DL_periodic)) * sigma2_DL
            
          } 
          else{
            K_DL_periodic[j,k] = 0
            K_DL[j,k] = 0
            }
        }
      }
      
      K_DL_list[[tracker]] = K_DL
      K_DL_periodic_list[[tracker]] = K_DL_periodic
      tracker = tracker+1
    }
    
    #Combine the 3 DL covariance matrices together
    K_DL = matrix(0,nrow=time_span,ncol=time_span)
    K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
    
    for(i in 1:length(K_DL_periodic_list)){
      K_DL = K_DL + ((1/3)*K_DL_list[[i]])
      K_DL_periodic = K_DL_periodic + ((1/3)*K_DL_periodic_list[[i]])
    }
    
    #Store DL(3,6,12) covariance matrix for each cluster 
    K_DL_cluster[[c]] = K_DL
    K_DL_periodic_cluster[[c]] = K_DL
  }
  
  K_DL = matrix(0,nrow=time_span,ncol=time_span)
  K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
  
  for(i in 1:num_clus){
    K_DL = K_DL + ((1/num_clus)*K_DL_cluster[[i]])
    K_DL_periodic = K_DL_periodic + ((1/num_clus)*K_DL_periodic_cluster[[i]])
  }
  
  K_DL_norm = norm(K_DL_periodic,type = "F")
  
  #Heatmap of resulting K 
  # K_DL_heatmap = corrplot(K_DL, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
  # title = "DL (3,6,12) Covariance Structure")
  # K_DL_heatmap = corrplot(K_DL_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic DL (3,6,12) Covariance Structure")
  
  # K_DL_heatmap = matrix_heatmap(K_DL,title = "DL (3,6,12) Covariance Structure")
  K_DL_heatmap = matrix_heatmap(K_DL_periodic,title = "Periodic DL (3,6,12) Covariance Structure")
  
  ###Calculating gram matrix K_Interaction
  K_Interaction_cluster = list()
  K_Interaction_periodic_cluster = list()
  
  for (c in 1:num_clus){
    
    #Grab interaction pair data for cluster c
    cluster_data = decomposed_cluster_data[[c]]
    W2_clus = cluster_data$W2
    
    K_interaction_list = list()
    K_interaction_periodic_list = list()
    
    column_names = colnames(W2_clus)
    time_span = nrow(W2_clus)
    
    #Create sequence of indices corresponding to comparisons for real time and one lag interaction effects
    lag0_idx = seq(2,3601,by=61)
    lag1_idx = seq(1,3600,by=61)
    
    #Calculate a kernel for each interaction pair 
    for (a in 1:length(column_names)){
      interaction =  W2_clus[,a]
      
      #First calculate these two interaction kernels separately 
      K_int0 = matrix(nrow = 60,ncol = 60)
      K_int1 = matrix(nrow = 60,ncol = 60)
      
      K_int0_periodic = matrix(nrow = 60,ncol = 60)
      K_int1_periodic = matrix(nrow = 60,ncol = 60)
      
      for (i in 1:60){
        for (j in 1:60){
          
          #RBF kernels
          K_int0[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                              / (2*rho_int_rbf)) * sigma2_int
    
          K_int1[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                              / (2*rho_int_rbf)) * sigma2_int
          
          #Locally periodic kernels 
          K_int0_periodic[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
                          / (2*rho_int_rbf)) * 
            exp(- (2*sin((abs(interaction[lag0_idx[i]] - interaction[lag0_idx[j]]))*pi/12)^2)
                          / (rho_int_periodic)) * sigma2_int
              
          K_int1_periodic[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
                               / (2*rho_int_rbf)) * 
            exp(- (2*sin((abs(interaction[lag1_idx[i]] - interaction[lag1_idx[j]]))*pi/12)^2)
                          / (rho_int_periodic)) * sigma2_int
        }
      }
      
      #Combine real time and one lag interaction kernels together
      K_interaction = 0.5*K_int0 + 0.5*K_int1
      K_interaction_list[[a]] = K_interaction
      
      K_interaction_periodic = 0.5*K_int0_periodic + 0.5*K_int1_periodic
      K_interaction_periodic_list[[a]] = K_interaction_periodic
    }
    
    #Combine kernels for each interaction pair together
    K_interaction = matrix(0,nrow=60,ncol=60)
    K_interaction_periodic = matrix(0,nrow=60,ncol=60)
    
    for(i in 1:length(K_interaction_periodic_list)){
      K_interaction = K_interaction + ((1/length(K_interaction_list))*K_interaction_list[[i]])
      
      K_interaction_periodic = K_interaction_periodic + ((1/length(K_interaction_periodic_list))*K_interaction_periodic_list[[i]])
    }

    #Store final interaction kernel (for all pairs) for each cluster 
    K_Interaction_cluster[[c]] = K_interaction
    K_Interaction_periodic_cluster[[c]] = K_interaction_periodic
  }
  
  K_interaction = matrix(0,nrow=60,ncol=60)
  K_interaction_periodic = matrix(0,nrow=60,ncol=60)
  
  for(i in 1:num_clus){
    K_interaction = K_interaction + ((1/length(K_Interaction_cluster))*K_Interaction_cluster[[i]])
    
    K_interaction_periodic = K_interaction_periodic + ((1/length(K_Interaction_periodic_cluster))*K_Interaction_periodic_cluster[[i]])
  }
  
  K_Int_norm = norm(K_interaction_periodic,type = "F")
  
  #Heatmap of resulting K 
  # K_Interaction_heatmap = corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Interaction Covariance Structure")
  # K_Interaction_heatmap = corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic Interaction Covariance Structure")
  
  # K_Interaction_heatmap = matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
  K_Interaction_heatmap = matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")
  
  gram = (1/3)*(K_AR_periodic+K_DL_periodic+K_interaction_periodic)
  K_weight = norm((1/60)*gram,type = "F")
  
  ###Load graph regression kernel 
  # covGP5 = kronecker(gram,H^2)
  covGP5 = kronecker(gram/60,(H^2)/7)
  
  #Need to ensure precision matrix is not computationally singular i.e det > 0
  covGP_jittered = desingularize(covGP5,threshold = 1e-2,increment = 0.01)
  covGP5 = covGP_jittered[[1]]
  
  inv_covGP5 = solve(covGP5)
  
  #Heatmap of resulting K 
  inv_covGP5_heatmap = matrix_heatmap(inv_covGP5,title = "Precision matrix heatmap")
  
  
  ###Fit INLA model 
  # kgr_formula5 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
  #   Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP5)
  
  kgr_formula5 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
    Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP5)
  
  model = inla(formula = kgr_formula5,family = "poisson",data = dataset,
                  control.compute = list(dic=TRUE,waic=TRUE,
                                         return.marginals.predictor=TRUE),
                  control.inla = list(strategy = "laplace"),
                  control.predictor = list(compute = TRUE, link = link))
  
  ###Extract relevant information and store in the list
  model_summary <- model$summary.fixed
  bri_hyperpar_summary <- bri.hyperpar.summary(model)
  model_DIC <- model$dic$dic
  model_WAIC <- model$waic$waic
  preds_model <- model$summary.fitted.values
  preds_model <- cbind(dataset$id, dataset$time, preds_model)
  colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
  marginal_fvs <- model$marginals.fitted.values
  
  #Exponentiating parameter to get better interpretation of estimates 
  multeff <- exp(model$summary.fixed$mean)
  names(multeff) <- model$names.fixed
  
  #Plot of each parameters' posterior density 
  mf <- melt(model$marginals.fixed)
  cf <- spread(mf,Var2,value)
  names(cf)[2] <- 'parameter'
  param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter, 
             scales="free") + geom_vline(xintercept=0) + ylab("density")
  
  #Plot of precision of random effect (main hyperparameter of interest)
  sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
  hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") + 
             xlab("linear predictor")
  
  #Store the results in the list
  kgr_model5_results = list(
    K_AR_heatmap = K_AR_heatmap, 
    K_DL_heatmap = K_DL_heatmap,
    K_Interaction_heatmap = K_Interaction_heatmap,
    K_AR_weight = K_AR_norm / (K_AR_norm + K_DL_norm + K_Int_norm),
    K_DL_weight = K_DL_norm / (K_AR_norm + K_DL_norm + K_Int_norm),
    K_Int_weight = K_Int_norm / (K_AR_norm + K_DL_norm + K_Int_norm),
    K_weight = K_weight/(K_weight + gfilter_weight),
    gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
    covmatrix = covGP5,
    prec = inv_covGP5,
    num_jitters = covGP_jittered[[2]],
    prec_heatmap = inv_covGP5_heatmap,
    model_summary = model_summary,
    bri_hyperpar_summary = bri_hyperpar_summary,
    exp_effects = multeff,
    param_plot = param_plot,
    hyperparam_plot = hyperparam_plot,
    model_DIC = model_DIC,
    model_WAIC = model_WAIC,
    fitted_values = preds_model,
    marg_fitted_values = marginal_fvs
  )
  
  return(kgr_model5_results)
}

#Fit kgr_model5
kgr_model5_fit = kgr_model5(dataset = inla_insample_data, rho_AR_rbf = 0.003, rho_AR_periodic = 0.003,
                            rho_DL_rbf = 0.011, rho_DL_periodic = 0.013, rho_int_rbf = 0.007,
                            rho_int_periodic = 0.008, sigma2_AR = 2.281, sigma2_DL = 2.650, sigma2_int = 0.448, link=1)

#Extract DIC and WAIC 
kgr_model5_DIC = kgr_model5_fit$model_DIC
kgr_model5_WAIC = kgr_model5_fit$model_WAIC

#Get summaries of parameter estimates
kgr_model5_fit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0865646 7.254732 -12.141086 2.0865646   16.31421 2.0865646
## months2    1.8606007 7.254733 -12.367052 1.8606007   16.08825 1.8606007
## months3    1.8877047 7.254733 -12.339948 1.8877047   16.11536 1.8877047
## months4    1.6991779 7.254734 -12.528476 1.6991779   15.92683 1.6991779
## months5    1.6159844 7.254736 -12.611673 1.6159844   15.84364 1.6159844
## months6    1.4945327 7.254736 -12.733126 1.4945327   15.72219 1.4945327
## months7    1.4667381 7.254743 -12.760934 1.4667381   15.69441 1.4667381
## months8    1.4374114 7.254744 -12.790263 1.4374114   15.66509 1.4374114
## months9    1.4008301 7.254744 -12.826843 1.4008301   15.62850 1.4008301
## months10   1.4737520 7.254744 -12.753920 1.4737520   15.70142 1.4737520
## months11   1.5211576 7.254742 -12.706512 1.5211576   15.74883 1.5211576
## months12   1.7737113 7.254740 -12.453954 1.7737113   16.00138 1.7737113
## Intercept1 2.1518091 7.254746 -12.075868 2.1518091   16.37949 2.1518091
## Intercept2 0.6789171 7.254850 -13.548965 0.6789171   14.90680 0.6789171
## Intercept3 2.0715469 7.254760 -12.156159 2.0715469   16.29925 2.0715469
## Intercept4 4.2902855 7.254735  -9.937369 4.2902855   18.51794 4.2902855
## Intercept5 3.6573255 7.254744 -10.570348 3.6573255   17.88500 3.6573255
## Intercept6 1.8615872 7.254749 -12.366097 1.8615872   16.08927 1.8615872
## Intercept7 5.0066943 7.254718  -9.220928 5.0066943   19.23432 5.0066943
##                     kld
## months1    5.527843e-11
## months2    5.527841e-11
## months3    5.527841e-11
## months4    5.527840e-11
## months5    5.527851e-11
## months6    5.527823e-11
## months7    5.527853e-11
## months8    5.527824e-11
## months9    5.527838e-11
## months10   5.527826e-11
## months11   5.527841e-11
## months12   5.527844e-11
## Intercept1 5.527835e-11
## Intercept2 5.527852e-11
## Intercept3 5.527826e-11
## Intercept4 5.527839e-11
## Intercept5 5.527838e-11
## Intercept6 5.527843e-11
## Intercept7 5.527821e-11
kgr_model5_fit$bri_hyperpar_summary
##               mean        sd   q0.025     q0.5   q0.975     mode
## SD for id2 1.60343 0.1181912 1.381791 1.599504 1.846087 1.592017
kgr_model5_fit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.057188   6.427597   6.604193   5.469449   5.032840   4.457253   4.335072 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.209784   4.058568   4.365584   4.577521   5.892683   8.600403   1.971741 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.937092  72.987303  38.757548   6.433941 149.410020
kgr_model5_fit$K_AR_weight
## [1] 0.3472991
kgr_model5_fit$K_DL_weight
## [1] 0.397453
kgr_model5_fit$K_Int_weight
## [1] 0.2552478
kgr_model5_fit$K_weight
## [1] 0.5113507
kgr_model5_fit$gfilter_weight
## [1] 0.4886493
kgr_model5_fit$num_jitters
## [1] 0
#Show plots
kgr_model5_fit$K_AR_heatmap

kgr_model5_fit$K_DL_heatmap

kgr_model5_fit$K_Interaction_heatmap

kgr_model5_fit$prec_heatmap

kgr_model5_fit$param_plot

kgr_model5_fit$hyperparam_plot

pp_insample_plot(pred_data = kgr_model5_fit$fitted_values)

Comparing in DIC and WAIC between models

With INLA, we can obtain the deviance information criterion (DIC) and the widely appliciable (or Watanabe-Akaike) information criterion (WAIC) which have the following formulas:

\(DIC = \bar D + p_D\) where the first term is the posterior mean deviance i.e., a measure of fit \(\bar D = E_{\theta | y} [D(\theta)]\) and the second term is the effective number of parameters i.e. a measure of model complexity \(p_D = E_{\theta | y} [D(\theta)] - D(E_{\theta | y}[\theta]) = \bar D - D(\bar \theta)\)

where \(D(\theta) = -2 log(p(y | \theta))\)

\(WAIC = T_n + \frac{V_n}{n}\) where \(T_n = -\frac{1}{n} \sum_{i=1}^n log p^*(Y_i | w)\) and \(V_n = \sum_{i=1}^n \{ E_w[(log p(Y_i | w))^2] - E_w[log p(Y_i | w)]^2 \}\)

where \(T_n\) is the log loss function and \(w\) is are the parameters in our model.

Also note that for both criteria, the smaller the value, the better the model

infocrit_table = matrix(nrow = 8,ncol = 2)

dics = c(ref_model1_DIC,ref_model2_DIC,kgr_model1_DIC,
         kgr_model2_DIC,kgr_model3_DIC,kgr_model4_DIC,kgr_model5_DIC)

waics = c(ref_model1_WAIC,ref_model2_WAIC,kgr_model1_WAIC,
          kgr_model2_WAIC,kgr_model3_WAIC,kgr_model4_WAIC,kgr_model5_WAIC)

infocrit_table = cbind(dics,waics)
colnames(infocrit_table) = c("DIC","WAIC")
rownames(infocrit_table) = c("Poisson GLM model","BYM model",
                             "Proposed KGR model 1","Proposed KGR model 2",
                             "Proposed KGR model 3","Proposed KGR model 4",
                             "Proposed KGR model 5")

infocrit_table = data.frame(infocrit_table)
infocrit_table
##                           DIC     WAIC
## Poisson GLM model    2869.994 4065.576
## BYM model            2869.994 4065.576
## Proposed KGR model 1 2842.645 2822.394
## Proposed KGR model 2 2840.079 2820.259
## Proposed KGR model 3 2841.500 2820.577
## Proposed KGR model 4 2841.736 2820.412
## Proposed KGR model 5 2841.843 2815.817

Comparing in sample RMSE for different clusters between models

One way to compare performance between the models fit above is to calculate RMSEs for each model’s fit on each cluster’s time series. Since INLA makes predictions based on the posterior predictive distribution, I actually calculated two sets of RMSEs. The first one is the RMSE of the predictions made by each model on the observed training data points i.e. not months 55-60. These were the observations that the models were fit on so we would expect small discrepancies between the observed values and the posterior predictive means for those time periods. The second one is the RMSE of the predictions made by each model on the test data points i.e. months 55-60. There was a lot more variation in the RMSEs calculated for these data points obviously.

Another important thing to note here is that the RMSEs calculated for each cluster were drastically different because the population sizes between clusters varied by a lot (think thousands compared to hundred thousands). So in order to make actual comparisons, the RMSEs had to be scaled which involves dividing the calculated RMSE by the average of the actual observed data points. The resulting RMSE values for each cluster, which are presented in tables below, can now be interpreted relative to the average number of respiratory related deaths in that cluster.

#Overall fit
RMSE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_insample_data %>% filter(id == i) %>% select(response) %>% data.frame()
  
  actual.mean = mean(actual$response)

  pm_1 = ref_model2_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_fvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  
  rmse1 = sqrt(mean((actual.mean - pm_1$mean)^2))
  rmse2 = sqrt(mean((actual.mean - pm_2$mean)^2))
  rmse3 = sqrt(mean((actual.mean - pm_3$mean)^2))
  rmse4 = sqrt(mean((actual.mean - pm_4$mean)^2))
  rmse5 = sqrt(mean((actual.mean - pm_5$mean)^2))
  rmse6 = sqrt(mean((actual.mean - pm_6$mean)^2))
  rmse7 = sqrt(mean((actual.mean - pm_7$mean)^2))

  RMSE_table[,i] = c(rmse1,rmse2,rmse3,rmse4,rmse5,rmse6,rmse7) 
  RMSE_table[,i] = RMSE_table[,i] / actual.mean
}

#Table 1: In sample RMSE
RMSE_table = data.frame(RMSE_table)

colnames(RMSE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
                         "Cluster 5","Cluster 6","Cluster 7")
rownames(RMSE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

RMSE_table
##                      Cluster 1   Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model            0.2294095 0.229564523 0.2294132 0.2293690 0.2293738
## LGCP model           0.2049814 0.001180824 0.1760954 0.2474671 0.2674100
## Proposed KGR model 1 0.2314481 0.224259746 0.2305573 0.2477849 0.2557097
## Proposed KGR model 2 0.2315776 0.208855195 0.2264184 0.2482402 0.2610482
## Proposed KGR model 3 0.2305485 0.216207947 0.2274065 0.2480105 0.2594430
## Proposed KGR model 4 0.2310047 0.220744251 0.2287939 0.2478884 0.2578252
## Proposed KGR model 5 0.2281278 0.206960185 0.2259589 0.2501442 0.2663319
##                      Cluster 6 Cluster 7
## BYM model            0.2294250 0.2293662
## LGCP model           0.2197868 0.2319521
## Proposed KGR model 1 0.2370027 0.2351349
## Proposed KGR model 2 0.2356730 0.2341821
## Proposed KGR model 3 0.2354984 0.2345411
## Proposed KGR model 4 0.2360795 0.2347908
## Proposed KGR model 5 0.2291305 0.2318024

OUT OF SAMPLE FITTING (FORECASTING) ANALYSIS

Plots of true mortality values

true_mortality = inla_full_data
true_mortality$time = as.numeric(true_mortality$time)

#Combine plots with library patchwork
true1 = true_mortality %>% filter(id == 1) %>% ggplot(aes(x=time,y=response)) + geom_line()

true2 = true_mortality %>% filter(id == 2) %>% ggplot(aes(x=time,y=response)) + geom_line() 

true3 = true_mortality %>% filter(id == 3) %>% ggplot(aes(x=time,y=response)) + geom_line() 

true4 = true_mortality %>% filter(id == 4) %>% ggplot(aes(x=time,y=response)) + geom_line() 

true5 = true_mortality %>% filter(id == 5) %>% ggplot(aes(x=time,y=response)) + geom_line()

true6 = true_mortality %>% filter(id == 6) %>% ggplot(aes(x=time,y=response)) + geom_line()

true7 = true_mortality %>% filter(id == 7) %>% ggplot(aes(x=time,y=response)) + geom_line()

true1 + true2 + true3 + true4 + true5 + true6 + true7

#Write a function to make plot of posterior predictive estimates with credible interval bands OVERLAID on response
pp_outsample_plot = function(num_plots = num_clus, ref_data = inla_full_data, pred_data){
  for (i in 1:num_plots){
  df = ref_data %>% filter(id == i) %>% select(response)
  preds = pred_data %>% filter(id == i) 
  df = cbind(df,preds)
  
  title = sprintf("Posterior Predictive Fits for Cluster %s",i)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_ribbon(aes(ymin = `0.025quant`,ymax = `0.975quant`),alpha = 0.3) + geom_vline(xintercept = 54,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
  print(post_pred_plot)
  }
}

Fit a simple Poisson GLMM for our mortality data (Reference model 1)

We wanted to compare the performance of our proposed model with a few reference models. The first one is a Poisson generalized linear mixed model. This model assumes the observed data follows a Poisson distribution and the hyperparameter \(\lambda_i\) can be modeled using a mixed effects model with a log link.

In other words,

\(Y_{i,t} \sim Pois(\lambda_{i,t})\) for \(i=1,...,7\) and \(t=1,...,60\) where \(log(\lambda_{i,t}) = \beta_0 + \beta_1 * I \{t=2,14,...,62 \} + ... + \beta_{11} * I \{t=12,24,...,72 \} + u_i\)

where the random effect \(u_i \sim MVN(0,\tau \Sigma)\)

We wanted the first reference model to be simple, so we assumed that the random effects \(u_i\) are iid. This means that \(\Sigma\) is simply a diagonal matrix of scaling factors. The hyperparameter \(log(\tau)\) is by default assigned a \(log \; \Gamma(1,0.00005)\) prior.

#Run model 
ref_model1_outfit = ref_model1(inla_outsample_data, a_prior=1, b_prior=1e-5)

#Extract DIC and WAIC
ref_model1_DIC = ref_model1_outfit$model_DIC
ref_model1_WAIC = ref_model1_outfit$model_WAIC

#Get summaries of parameter estimates
ref_model1_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.1040003 7.254715 -12.123616 2.1040003   16.33162 2.1040003
## months2    1.8618778 7.254716 -12.365741 1.8618778   16.08950 1.8618778
## months3    1.8812186 7.254716 -12.346400 1.8812186   16.10884 1.8812186
## months4    1.6930269 7.254717 -12.534594 1.6930269   15.92065 1.6930269
## months5    1.6124060 7.254718 -12.615217 1.6124060   15.84003 1.6124060
## months6    1.4969699 7.254719 -12.730655 1.4969699   15.72459 1.4969699
## months7    1.4681960 7.254722 -12.759434 1.4681960   15.69583 1.4681960
## months8    1.4344608 7.254722 -12.793170 1.4344608   15.66209 1.4344608
## months9    1.4008388 7.254723 -12.826793 1.4008388   15.62847 1.4008388
## months10   1.4724090 7.254722 -12.755221 1.4724090   15.70004 1.4724090
## months11   1.5162774 7.254721 -12.711351 1.5162774   15.74391 1.5162774
## months12   1.7814823 7.254719 -12.446141 1.7814823   16.00911 1.7814823
## Intercept1 2.1524562 7.254737 -12.075203 2.1524562   16.38012 2.1524562
## Intercept2 0.6814394 7.254819 -13.546382 0.6814394   14.90926 0.6814394
## Intercept3 2.0751109 7.254739 -12.152552 2.0751109   16.30277 2.0751109
## Intercept4 4.2895290 7.254715  -9.938087 4.2895290   18.51715 4.2895290
## Intercept5 3.6523905 7.254717 -10.575231 3.6523905   17.88001 3.6523905
## Intercept6 1.8619689 7.254745 -12.365706 1.8619689   16.08964 1.8619689
## Intercept7 5.0102692 7.254713  -9.217344 5.0102692   19.23788 5.0102692
##                     kld
## months1    5.527842e-11
## months2    5.527840e-11
## months3    5.527840e-11
## months4    5.527839e-11
## months5    5.527837e-11
## months6    5.527849e-11
## months7    5.527831e-11
## months8    5.527831e-11
## months9    5.527844e-11
## months10   5.527832e-11
## months11   5.527845e-11
## months12   5.527837e-11
## Intercept1 5.527850e-11
## Intercept2 5.527831e-11
## Intercept3 5.527820e-11
## Intercept4 5.527819e-11
## Intercept5 5.527837e-11
## Intercept6 5.527837e-11
## Intercept7 5.527841e-11
ref_model1_outfit$bri_hyperpar_summary
##                  mean          sd      q0.025        q0.5     q0.975
## SD for id 0.005177216 0.004471659 0.001635036 0.003776737 0.01824026
##                  mode
## SD for id 0.002548752
ref_model1_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.198903   6.435810   6.561496   5.435910   5.014863   4.468129   4.341396 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.197381   4.058603   4.359725   4.555236   5.938653   8.605970   1.976721 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.965429  72.932108  38.566749   6.436397 149.945090
#Show plots
ref_model1_outfit$param_plot

ref_model1_outfit$hyperparam_plot

pp_outsample_plot(pred_data = ref_model1_outfit$fitted_values)

Fitting a Besag-York-Mollie model (Reference model 2)

For our second reference model, we decided to fit a Besag-York-Mollie model, which is a log-normal Poisson model with an intrinsic conditional autoregressive component to capture spatial autocorrelations i.e. a Besag model, plus a standard random effects term which is included to capture non-spatial heterogeneity. Obviously, this model is less naive than reference model 1 because it does not assume iid random effects.

The BYM model can be written as,

\(Y_{i,t} \sim Pois(\lambda_{i,t})\) for \(i=1,...,7\) and \(t=1,...,60\) where \(log(\lambda_{i,t}) = \beta_0 + \beta_1 * I \{t=2,14,...,62 \} + ... + \beta_{11} * I \{t=12,24,...,72 \} + \phi + u_i\)

where \(p(\phi) \propto exp(-\frac{1}{2} \sum_{i \sim j} (\phi_i - \phi_j)^2)\) and \(u_i \sim MVN(0,\tau \Sigma)\).

Note: it is more commonly known that ICAR components are conditionally normally distributed.

As one can see below, the summary outputs indicate that this model is very similar to the Poisson GLMM (reference model 1). The intercept and SD for the random effect component are estimated to almost the exact same as those estimated by the Poisson GLMM, indicating that including the spatial ICAR component is seemingly not very impactful.

#Run model 
ref_model2_outfit = ref_model2(dataset = inla_outsample_data,a_prec_prior = 1,b_prec_prior = 1e-5,
                            a_phi_prior = 2,b_phi_prior = 0.5005)

#Extract DIC and WAIC
ref_model2_DIC = ref_model2_outfit$model_DIC
ref_model2_WAIC = ref_model2_outfit$model_WAIC

#Get summaries of parameter estimates
ref_model2_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.1040002 7.254715 -12.123616 2.1040002   16.33162 2.1040002
## months2    1.8618776 7.254716 -12.365741 1.8618776   16.08950 1.8618776
## months3    1.8812185 7.254716 -12.346400 1.8812185   16.10884 1.8812185
## months4    1.6930268 7.254717 -12.534594 1.6930268   15.92065 1.6930268
## months5    1.6124059 7.254718 -12.615217 1.6124059   15.84003 1.6124059
## months6    1.4969697 7.254719 -12.730655 1.4969697   15.72459 1.4969697
## months7    1.4681958 7.254722 -12.759434 1.4681958   15.69583 1.4681958
## months8    1.4344607 7.254722 -12.793170 1.4344607   15.66209 1.4344607
## months9    1.4008387 7.254723 -12.826793 1.4008387   15.62847 1.4008387
## months10   1.4724089 7.254722 -12.755221 1.4724089   15.70004 1.4724089
## months11   1.5162773 7.254721 -12.711352 1.5162773   15.74391 1.5162773
## months12   1.7814822 7.254719 -12.446141 1.7814822   16.00911 1.7814822
## Intercept1 2.1525450 7.259066 -12.083608 2.1525448   16.38870 2.1525446
## Intercept2 0.6814789 7.260532 -13.557552 0.6814784   14.92051 0.6814775
## Intercept3 2.0752552 7.260396 -12.163509 2.0752549   16.31402 2.0752545
## Intercept4 4.2894110 7.260285  -9.949138 4.2894114   18.52796 4.2894120
## Intercept5 3.6522438 7.262452 -10.590563 3.6522443   17.89505 3.6522453
## Intercept6 1.8620586 7.259134 -12.374229 1.8620585   16.09835 1.8620582
## Intercept7 5.0101701 7.259045  -9.225943 5.0101703   19.24628 5.0101708
##                     kld
## months1    5.527842e-11
## months2    5.527840e-11
## months3    5.527827e-11
## months4    5.527838e-11
## months5    5.527824e-11
## months6    5.527836e-11
## months7    5.527832e-11
## months8    5.527831e-11
## months9    5.527844e-11
## months10   5.527832e-11
## months11   5.527846e-11
## months12   5.527837e-11
## Intercept1 5.512345e-11
## Intercept2 5.500941e-11
## Intercept3 5.501457e-11
## Intercept4 5.502247e-11
## Intercept5 5.478658e-11
## Intercept6 5.511931e-11
## Intercept7 5.512336e-11
ref_model2_outfit$bri_hyperpar_summary
##                                      mean          sd      q0.025        q0.5
## SD for id (idd component)     0.004715936 0.002874206 0.001728585 0.003884144
## SD for id (spatial component) 0.609526015 0.237151368 0.303609741 0.555125478
##                                  q0.975       mode
## SD for id (idd component)     0.0124817 0.00281349
## SD for id (spatial component) 1.2168261 0.46351753
ref_model2_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.198902   6.435810   6.561495   5.435909   5.014862   4.468129   4.341396 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.197381   4.058603   4.359725   4.555236   5.938652   8.606734   1.976799 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.966579  72.923507  38.561092   6.436974 149.930235
#Show plots
ref_model2_outfit$param_plot

ref_model2_outfit$hyperparam_plot

pp_outsample_plot(pred_data = ref_model2_outfit$fitted_values)

Fitting kernel graph regression models

KGR model with time series kernel x graph filter (Proposed model 2)

Finally, we fit our proposed model which we call a kernel graph regression model. It also takes the form of a latent Gaussian model as shown below:

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{i,t} | \textbf{X} = exp(\beta_0 + F_{i,t})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,K \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(x_{t_1},x_{t_2})(H^2)_{n_1,n_2}\).

#Run model 
kgr_model2_outfit = kgr_model2(data = inla_outsample_data,rho_EPA_rbf = 2429.591, rho_EPA_periodic = 1612.206, sigma2_EPA = 4.918)

#Extract DIC and WAIC
kgr_model2_DIC = kgr_model2_outfit$model_DIC
kgr_model2_WAIC = kgr_model2_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model2_outfit$model_summary
##                mean       sd 0.025quant 0.5quant 0.975quant     mode
## months1    2.103662 7.254733 -12.123990 2.103662   16.33131 2.103662
## months2    1.863340 7.254733 -12.364312 1.863340   16.09099 1.863340
## months3    1.886610 7.254734 -12.341044 1.886610   16.11426 1.886610
## months4    1.694695 7.254736 -12.532962 1.694695   15.92235 1.694695
## months5    1.615105 7.254738 -12.612557 1.615105   15.84277 1.615105
## months6    1.493718 7.254738 -12.733944 1.493718   15.72138 1.493718
## months7    1.456952 7.254746 -12.770724 1.456952   15.68463 1.456952
## months8    1.434437 7.254746 -12.793240 1.434437   15.66211 1.434437
## months9    1.394261 7.254747 -12.833418 1.394261   15.62194 1.394261
## months10   1.470979 7.254746 -12.756698 1.470979   15.69866 1.470979
## months11   1.507559 7.254746 -12.720118 1.507559   15.73524 1.507559
## months12   1.780046 7.254742 -12.447624 1.780046   16.00772 1.780046
## Intercept1 2.151521 7.254754 -12.076172 2.151521   16.37922 2.151521
## Intercept2 0.670304 7.254866 -13.557609 0.670304   14.89822 0.670304
## Intercept3 2.067235 7.254771 -12.160491 2.067235   16.29496 2.067235
## Intercept4 4.288416 7.254741  -9.939252 4.288416   18.51608 4.288416
## Intercept5 3.661067 7.254751 -10.566620 3.661067   17.88875 3.661067
## Intercept6 1.859224 7.254757 -12.368475 1.859224   16.08692 1.859224
## Intercept7 5.003596 7.254724  -9.224038 5.003596   19.23123 5.003596
##                     kld
## months1    5.527841e-11
## months2    5.527829e-11
## months3    5.527827e-11
## months4    5.527824e-11
## months5    5.527835e-11
## months6    5.527847e-11
## months7    5.527836e-11
## months8    5.527836e-11
## months9    5.527848e-11
## months10   5.527836e-11
## months11   5.527836e-11
## months12   5.527841e-11
## Intercept1 5.527837e-11
## Intercept2 5.527828e-11
## Intercept3 5.527825e-11
## Intercept4 5.527843e-11
## Intercept5 5.527828e-11
## Intercept6 5.527846e-11
## Intercept7 5.527838e-11
kgr_model2_outfit$bri_hyperpar_summary
##                 mean         sd    q0.025      q0.5    q0.975      mode
## SD for id2 0.4337749 0.03419719 0.3698794 0.4325623 0.5041908 0.4302725
kgr_model2_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.196128   6.445225   6.596964   5.444985   5.028414   4.453624   4.292854 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.197280   4.031995   4.353494   4.515693   5.930131   8.597930   1.954831 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.902942  72.850953  38.902813   6.418751 148.947804
kgr_model2_outfit$K_EPA_weight
## [1] 0.9173235
kgr_model2_outfit$gfilter_weight
## [1] 0.08267646
#Show plots
kgr_model2_outfit$K_EPA_heatmap

kgr_model2_outfit$param_plot

kgr_model2_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model2_outfit$fitted_values)

We can also simplify the covariance of our underlying GP and see how our proposed model compares with a simplified version with a simple time kernel:

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{t} | \textbf{X} = exp(\beta_0 + F_{t})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,Ktime)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(t_1,t_2)\).

Instead of calculating gram matrix K based on covariate (EPA variables) similarity, our gram matrix K is simply a time kernel where similar values of t (months 1-72) have larger covariances. As a result, this model has no spatial dependence structure built in.

LGCP with temporal kernel (Reference model 3)

#Fit ref_model3 on one cluster (to test)
ref_model3_outfit = ref_model3(dataset = inla_outsample_data, cluster = 2, rho_time_rbf = 1,
                   rho_time_periodic = 1, sigma2_time = 5)

#Extract DIC and WAIC
ref_model3_DIC = ref_model3_outfit$model_DIC
ref_model3_WAIC = ref_model3_outfit$model_WAIC

#Get summaries of parameter estimates
ref_model3_outfit$model_summary
##                    mean          sd 0.025quant      0.5quant 0.975quant
## Intercept1 5.275922e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
## Intercept2 2.363643e+00  0.04200925   2.281251  2.363643e+00   2.446032
## Intercept3 5.275922e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
## Intercept4 5.275922e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
## Intercept5 5.275922e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
## Intercept6 5.275922e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
## Intercept7 5.275922e-15 31.62254149 -62.016688 -4.615441e-15  62.016688
##                mode          kld
## Intercept1 0.000000 5.527836e-11
## Intercept2 2.363643 3.296470e-11
## Intercept3 0.000000 5.527836e-11
## Intercept4 0.000000 5.527836e-11
## Intercept5 0.000000 5.527836e-11
## Intercept6 0.000000 5.527836e-11
## Intercept7 0.000000 5.527836e-11
ref_model3_outfit$bri_hyperpar_summary
##                    mean         sd      q0.025       q0.5     q0.975
## SD for time 0.009940187 0.00637071 0.003620911 0.00802757 0.02789661
##                    mode
## SD for time 0.005716609
ref_model3_outfit$exp_effects
## Intercept1 Intercept2 Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##     1.0000    10.6296     1.0000     1.0000     1.0000     1.0000     1.0000
ref_model3_outfit$K_time_weight
## [1] 0.9952043
ref_model3_outfit$gfilter_weight
## [1] 0.004795715
#Show plots
ref_model3_outfit$K_time_heatmap

ref_model3_outfit$param_plot

ref_model3_outfit$hyperparam_plot

test1 = ref_model3(dataset = inla_outsample_data, cluster = 1, rho_time_rbf = 214.194,
                   rho_time_periodic = 4.192, sigma2_time = 2.166)
test2 = ref_model3(dataset = inla_outsample_data, cluster = 2, rho_time_rbf = 301.021,
                   rho_time_periodic = 1.765, sigma2_time = 1.148)
test3 = ref_model3(dataset = inla_outsample_data, cluster = 3, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748)
test4 = ref_model3(dataset = inla_outsample_data, cluster = 4, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748) #why does sigma2 have to be so big here??
test5 = ref_model3(dataset = inla_outsample_data, cluster = 5, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748)
test6 = ref_model3(dataset = inla_outsample_data, cluster = 6, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748) #why does sigma2 have to be so big here??
test7 = ref_model3(dataset = inla_outsample_data, cluster = 7, rho_time_rbf = 54.805,
                   rho_time_periodic = 17.731, sigma2_time = 0.748)


ref_model3_outfvs = rbind(test1$fitted_values,test2$fitted_values,test3$fitted_values,
                       test4$fitted_values,test5$fitted_values,test6$fitted_values,test7$fitted_values)

pp_outsample_plot(num_plots = num_clus,ref_data = inla_full_data,pred_data = ref_model3_outfvs)

KGR model with temporal kernel x graph filter (Proposed model 1)

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{i,t} | \textbf{X} = exp(\beta_0 + F_{i,t})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,Ktime \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(t_1,t_2)(H^2)_{n_1,n_2}\).

#Fit kgr_model1
kgr_model1_outfit = kgr_model1(dataset = inla_outsample_data,rho_time_rbf = 557.614,rho_time_periodic = 545.753,sigma2_time = 4.581)

#Extract DIC and WAIC
kgr_model1_DIC = kgr_model1_outfit$model_DIC
kgr_model1_WAIC = kgr_model1_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model1_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0964716 7.254729 -12.131173 2.0964716   16.32412 2.0964716
## months2    1.8679041 7.254731 -12.359744 1.8679041   16.09555 1.8679041
## months3    1.8896231 7.254731 -12.338025 1.8896231   16.11727 1.8896231
## months4    1.6948633 7.254733 -12.532789 1.6948633   15.92252 1.6948633
## months5    1.6147969 7.254734 -12.612857 1.6147969   15.84245 1.6147969
## months6    1.4909953 7.254735 -12.736661 1.4909953   15.71865 1.4909953
## months7    1.4512718 7.254742 -12.776398 1.4512718   15.67894 1.4512718
## months8    1.4296532 7.254743 -12.798017 1.4296532   15.65732 1.4296532
## months9    1.3885959 7.254743 -12.839076 1.3885959   15.61627 1.3885959
## months10   1.4696237 7.254742 -12.758046 1.4696237   15.69729 1.4696237
## months11   1.5128407 7.254741 -12.714827 1.5128407   15.74051 1.5128407
## months12   1.7724364 7.254738 -12.455225 1.7724364   16.00010 1.7724364
## Intercept1 2.1494997 7.254771 -12.078227 2.1494997   16.37723 2.1494997
## Intercept2 0.6811812 7.254912 -13.546821 0.6811812   14.90918 0.6811812
## Intercept3 2.0674343 7.254806 -12.160361 2.0674343   16.29523 2.0674343
## Intercept4 4.2720236 7.254776  -9.955712 4.2720236   18.49976 4.2720236
## Intercept5 3.6433092 7.254796 -10.584466 3.6433092   17.87108 3.6433093
## Intercept6 1.8591192 7.254767 -12.368599 1.8591192   16.08684 1.8591192
## Intercept7 5.0065087 7.254734  -9.221145 5.0065087   19.23416 5.0065087
##                     kld
## months1    5.527847e-11
## months2    5.527830e-11
## months3    5.527830e-11
## months4    5.527842e-11
## months5    5.527841e-11
## months6    5.527838e-11
## months7    5.527827e-11
## months8    5.527827e-11
## months9    5.527839e-11
## months10   5.527842e-11
## months11   5.527843e-11
## months12   5.527848e-11
## Intercept1 5.527837e-11
## Intercept2 5.527839e-11
## Intercept3 5.527825e-11
## Intercept4 5.527834e-11
## Intercept5 5.527842e-11
## Intercept6 5.527844e-11
## Intercept7 5.527826e-11
kgr_model1_outfit$bri_hyperpar_summary
##                mean         sd    q0.025      q0.5   q0.975      mode
## SD for id2 0.452078 0.03534074 0.3859539 0.4508563 0.524762 0.4485373
kgr_model1_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.137407   6.474712   6.616874   5.445902   5.026867   4.441514   4.268540 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.177250   4.009217   4.347599   4.539608   5.885174   8.580565   1.976211 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.904516  71.666511  38.218101   6.418081 149.382292
kgr_model1_outfit$K_time_weight
## [1] 0.935156
kgr_model1_outfit$gfilter_weight
## [1] 0.06484397
#Show plots
kgr_model1_outfit$K_time_heatmap

kgr_model1_outfit$param_plot

kgr_model1_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model1_outfit$fitted_values)

Proposed model 3

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{it} | \textbf{X} = exp(\beta_0 + F_{it})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,(Ktime*K_{EPA}) \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = k(t_1,t_2)k(x_{t_1},x_{t_2})(H^2)_{n_1,n_2}\).

#Fit kgr_model3
kgr_model3_outfit = kgr_model3(dataset = inla_outsample_data, rho_EPA_rbf = 2871.019, rho_EPA_periodic = 2269.161,
                            rho_time_rbf = 3079.136, rho_time_periodic = 1060.033, sigma2 = 1.115, link=1)

#Extract DIC and WAIC
kgr_model3_DIC = kgr_model3_outfit$model_DIC
kgr_model3_WAIC = kgr_model3_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model3_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0994422 7.254731 -12.128205 2.0994422   16.32709 2.0994422
## months2    1.8659580 7.254731 -12.361691 1.8659580   16.09361 1.8659580
## months3    1.8895149 7.254732 -12.338135 1.8895149   16.11716 1.8895149
## months4    1.6972183 7.254734 -12.530435 1.6972183   15.92487 1.6972183
## months5    1.6172189 7.254735 -12.610437 1.6172189   15.84487 1.6172189
## months6    1.4940527 7.254736 -12.733605 1.4940527   15.72171 1.4940527
## months7    1.4571996 7.254743 -12.770472 1.4571996   15.68487 1.4571996
## months8    1.4349599 7.254743 -12.792712 1.4349599   15.66263 1.4349599
## months9    1.3943561 7.254744 -12.833317 1.3943561   15.62203 1.3943561
## months10   1.4719683 7.254743 -12.755703 1.4719683   15.69964 1.4719683
## months11   1.5095820 7.254743 -12.718089 1.5095820   15.73725 1.5095820
## months12   1.7764204 7.254739 -12.451244 1.7764204   16.00408 1.7764204
## Intercept1 2.1515482 7.254746 -12.076129 2.1515482   16.37923 2.1515482
## Intercept2 0.6767522 7.254842 -13.551113 0.6767522   14.90462 0.6767522
## Intercept3 2.0707971 7.254755 -12.156898 2.0707971   16.29849 2.0707971
## Intercept4 4.2865365 7.254730  -9.941109 4.2865365   18.51418 4.2865365
## Intercept5 3.6554563 7.254736 -10.572201 3.6554563   17.88311 3.6554563
## Intercept6 1.8599206 7.254752 -12.367768 1.8599206   16.08761 1.8599206
## Intercept7 5.0068802 7.254720  -9.220746 5.0068802   19.23451 5.0068802
##                     kld
## months1    5.527819e-11
## months2    5.527845e-11
## months3    5.527830e-11
## months4    5.527840e-11
## months5    5.527851e-11
## months6    5.527837e-11
## months7    5.527839e-11
## months8    5.527839e-11
## months9    5.527838e-11
## months10   5.527841e-11
## months11   5.527841e-11
## months12   5.527832e-11
## Intercept1 5.527834e-11
## Intercept2 5.527838e-11
## Intercept3 5.527835e-11
## Intercept4 5.527833e-11
## Intercept5 5.527826e-11
## Intercept6 5.527840e-11
## Intercept7 5.527852e-11
kgr_model3_outfit$bri_hyperpar_summary
##                 mean         sd    q0.025     q0.5    q0.975      mode
## SD for id2 0.6341904 0.04961669 0.5414159 0.632454 0.7362939 0.6291666
kgr_model3_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.161616   6.462124   6.616158   5.458742   5.039057   4.455114   4.293918 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.199477   4.032377   4.357804   4.524839   5.908668   8.598160   1.967477 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.931142  72.714183  38.685170   6.423227 149.437795
kgr_model3_outfit$K_weight
## [1] 0.6872526
kgr_model3_outfit$gfilter_weight
## [1] 0.3127474
#Show plots
kgr_model3_outfit$K_time_heatmap

kgr_model3_outfit$K_EPA_heatmap

kgr_model3_outfit$param_plot

kgr_model3_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model3_outfit$fitted_values)

Proposed model 4

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{it} | \textbf{X} = exp(\beta_0 + F_{it})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,(Ktime + K_{EPA}) \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = (k(t_1,t_2)+k(x_{t_1},x_{t_2}))(H^2)_{n_1,n_2}\).

#Fit kgr_model4
kgr_model4_outfit = kgr_model4(dataset = inla_outsample_data, rho_EPA_rbf = 446.638, rho_EPA_periodic = 413.201,
                            rho_time_rbf = 569.756, rho_time_periodic = 550.984, sigma2_EPA = 1.811, sigma2_time = 4.961, link = 1)

#Extract DIC and WAIC 
kgr_model4_DIC = kgr_model4_outfit$model_DIC
kgr_model4_WAIC = kgr_model4_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model4_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0957725 7.254730 -12.131874 2.0957725   16.32342 2.0957725
## months2    1.8665739 7.254732 -12.361076 1.8665739   16.09422 1.8665739
## months3    1.8882377 7.254732 -12.339413 1.8882377   16.11589 1.8882377
## months4    1.6936947 7.254734 -12.533959 1.6936947   15.92135 1.6936947
## months5    1.6150379 7.254735 -12.612618 1.6150379   15.84269 1.6150379
## months6    1.4918761 7.254736 -12.735782 1.4918761   15.71953 1.4918761
## months7    1.4522957 7.254743 -12.775377 1.4522957   15.67997 1.4522957
## months8    1.4306148 7.254743 -12.797057 1.4306148   15.65829 1.4306148
## months9    1.3893943 7.254745 -12.838281 1.3893943   15.61707 1.3893943
## months10   1.4696826 7.254743 -12.757989 1.4696826   15.69735 1.4696826
## months11   1.5130016 7.254743 -12.714669 1.5130016   15.74067 1.5130016
## months12   1.7720043 7.254739 -12.455659 1.7720043   15.99967 1.7720043
## Intercept1 2.1497218 7.254774 -12.078010 2.1497218   16.37745 2.1497218
## Intercept2 0.6795447 7.254919 -13.548472 0.6795447   14.90756 0.6795447
## Intercept3 2.0656920 7.254811 -12.162114 2.0656920   16.29350 2.0656920
## Intercept4 4.2727409 7.254780  -9.955003 4.2727409   18.50049 4.2727409
## Intercept5 3.6462699 7.254802 -10.581517 3.6462699   17.87406 3.6462699
## Intercept6 1.8590608 7.254768 -12.368660 1.8590608   16.08678 1.8590608
## Intercept7 5.0051559 7.254736  -9.222501 5.0051559   19.23281 5.0051559
##                     kld
## months1    5.527846e-11
## months2    5.527830e-11
## months3    5.527842e-11
## months4    5.527840e-11
## months5    5.527838e-11
## months6    5.527837e-11
## months7    5.527840e-11
## months8    5.527825e-11
## months9    5.527837e-11
## months10   5.527826e-11
## months11   5.527840e-11
## months12   5.527846e-11
## Intercept1 5.527833e-11
## Intercept2 5.527841e-11
## Intercept3 5.527831e-11
## Intercept4 5.527834e-11
## Intercept5 5.527829e-11
## Intercept6 5.527842e-11
## Intercept7 5.527851e-11
kgr_model4_outfit$bri_hyperpar_summary
##                 mean         sd    q0.025     q0.5   q0.975      mode
## SD for id2 0.6258224 0.04871945 0.5346796 0.624133 0.726037 0.6209287
kgr_model4_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.131721   6.466105   6.607714   5.439541   5.028078   4.445428   4.272913 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.181269   4.012419   4.347855   4.540339   5.882632   8.582470   1.972979 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.890757  71.717941  38.331421   6.417707 149.180338
kgr_model4_outfit$K_weight
## [1] 0.8913573
kgr_model4_outfit$gfilter_weight
## [1] 0.1086427
#Show plots
kgr_model4_outfit$K_time_heatmap

kgr_model4_outfit$K_EPA_heatmap

kgr_model4_outfit$param_plot

kgr_model4_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model4_outfit$fitted_values)

Proposed model 5

\(vec(Y) | \Lambda, \textbf{X} \sim Pois(vec(\Lambda))\) and \(\Lambda_{it} | \textbf{X} = exp(\beta_0 + F_{it})\)

where the graph signal \(\textbf{F} | \textbf{X} \sim GP(0,\frac{1}{3}(K_{AR} + K_{DL} + K_{Interaction}) \otimes H^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = (k(t_1,t_2)+k(x_{t_1},x_{t_2}))(H^2)_{n_1,n_2}\).

#Fit kgr_model5
kgr_model5_outfit = kgr_model5(dataset = inla_outsample_data, rho_AR_rbf = 0.003, rho_AR_periodic = 0.003,
                            rho_DL_rbf = 0.011, rho_DL_periodic = 0.013, rho_int_rbf = 0.007,
                            rho_int_periodic = 0.008, sigma2_AR = 2.281, sigma2_DL = 2.650, sigma2_int = 0.448, link=1)

#Extract DIC and WAIC 
kgr_model5_DIC = kgr_model5_outfit$model_DIC
kgr_model5_WAIC = kgr_model5_outfit$model_WAIC

#Get summaries of parameter estimates
kgr_model5_outfit$model_summary
##                 mean       sd 0.025quant  0.5quant 0.975quant      mode
## months1    2.0865646 7.254732 -12.141086 2.0865646   16.31421 2.0865646
## months2    1.8606007 7.254733 -12.367052 1.8606007   16.08825 1.8606007
## months3    1.8877047 7.254733 -12.339948 1.8877047   16.11536 1.8877047
## months4    1.6991779 7.254734 -12.528476 1.6991779   15.92683 1.6991779
## months5    1.6159844 7.254736 -12.611673 1.6159844   15.84364 1.6159844
## months6    1.4945327 7.254736 -12.733126 1.4945327   15.72219 1.4945327
## months7    1.4667381 7.254743 -12.760934 1.4667381   15.69441 1.4667381
## months8    1.4374114 7.254744 -12.790263 1.4374114   15.66509 1.4374114
## months9    1.4008301 7.254744 -12.826843 1.4008301   15.62850 1.4008301
## months10   1.4737520 7.254744 -12.753920 1.4737520   15.70142 1.4737520
## months11   1.5211576 7.254742 -12.706512 1.5211576   15.74883 1.5211576
## months12   1.7737113 7.254740 -12.453954 1.7737113   16.00138 1.7737113
## Intercept1 2.1518091 7.254746 -12.075868 2.1518091   16.37949 2.1518091
## Intercept2 0.6789171 7.254850 -13.548965 0.6789171   14.90680 0.6789171
## Intercept3 2.0715469 7.254760 -12.156159 2.0715469   16.29925 2.0715469
## Intercept4 4.2902855 7.254735  -9.937369 4.2902855   18.51794 4.2902855
## Intercept5 3.6573255 7.254744 -10.570348 3.6573255   17.88500 3.6573255
## Intercept6 1.8615872 7.254749 -12.366097 1.8615872   16.08927 1.8615872
## Intercept7 5.0066943 7.254718  -9.220928 5.0066943   19.23432 5.0066943
##                     kld
## months1    5.527844e-11
## months2    5.527827e-11
## months3    5.527841e-11
## months4    5.527827e-11
## months5    5.527837e-11
## months6    5.527837e-11
## months7    5.527826e-11
## months8    5.527824e-11
## months9    5.527838e-11
## months10   5.527839e-11
## months11   5.527828e-11
## months12   5.527831e-11
## Intercept1 5.527849e-11
## Intercept2 5.527838e-11
## Intercept3 5.527841e-11
## Intercept4 5.527829e-11
## Intercept5 5.527838e-11
## Intercept6 5.527830e-11
## Intercept7 5.527838e-11
kgr_model5_outfit$bri_hyperpar_summary
##               mean        sd   q0.025     q0.5   q0.975     mode
## SD for id2 1.60343 0.1181912 1.381791 1.599504 1.846087 1.592017
kgr_model5_outfit$exp_effects
##    months1    months2    months3    months4    months5    months6    months7 
##   8.057188   6.427597   6.604193   5.469449   5.032840   4.457253   4.335072 
##    months8    months9   months10   months11   months12 Intercept1 Intercept2 
##   4.209784   4.058568   4.365584   4.577521   5.892682   8.600403   1.971741 
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7 
##   7.937092  72.987303  38.757548   6.433941 149.410020
kgr_model5_outfit$K_weight
## [1] 0.5113507
kgr_model5_outfit$gfilter_weight
## [1] 0.4886493
#Show plots
kgr_model5_outfit$K_AR_heatmap

kgr_model5_outfit$K_DL_heatmap

kgr_model5_outfit$K_Interaction_heatmap

kgr_model5_outfit$prec_heatmap

kgr_model5_outfit$param_plot

kgr_model5_outfit$hyperparam_plot

pp_outsample_plot(pred_data = kgr_model5_outfit$fitted_values)

Comparing out of sample MAE, MASE, and MAPE for different clusters between models

print(degree_connectivity)
##   c(1:num_clus) node_connections
## 1             1                5
## 2             2                4
## 3             3                4
## 4             4                4
## 5             5                3
## 6             6                5
## 7             7                5

Mean absolute error (MAE) is a measure of the average size of the mistakes in a collection of predictions, without taking their direction into account

MAE = \(\frac{1}{h_{max}} \sum_{h=1}^{h_{max}} | \hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}|\) where \(\hat \lambda_{t+h}^{obs}\) is the average of the number of deaths observed for month \(t+h\) over all years

MAE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  
  actual_test = c()
  
  for (j in 7:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }

  pm_1_test = pm_1[55:60,]
  pm_2_test = pm_2[55:60,]
  pm_3_test = pm_3[55:60,]
  pm_4_test = pm_4[55:60,]
  pm_5_test = pm_5[55:60,]
  pm_6_test = pm_6[55:60,]
  pm_7_test = pm_7[55:60,] 

  actual_test_mean = mean(actual_test)

  mae1 = mean(abs(actual_test - pm_1_test$mean))
  mae2 = mean(abs(actual_test - pm_2_test$mean))
  mae3 = mean(abs(actual_test - pm_3_test$mean))
  mae4 = mean(abs(actual_test - pm_4_test$mean))
  mae5 = mean(abs(actual_test - pm_5_test$mean))
  mae6 = mean(abs(actual_test - pm_6_test$mean))
  mae7 = mean(abs(actual_test - pm_7_test$mean))

  MAE_table[,i] = c(mae1,mae2,mae3,mae4,mae5,mae6,mae7)
  # MAE_table[,i] = MAE_table[,i] / actual_test_mean
}

#Table 2: MAE on test dataset

MAE_table = data.frame(MAE_table)

colnames(MAE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
                         "Cluster 5","Cluster 6","Cluster 7")
rownames(MAE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

MAE_table
##                      Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model             2.592605  1.268546  1.409840  6.274060  5.752863
## LGCP model            2.820735  1.157376  4.255418 57.297519 36.129470
## Proposed KGR model 1  2.391592  1.285246  1.282672 13.678869  4.525800
## Proposed KGR model 2  2.705260  1.179156  1.425011  4.644662  5.663649
## Proposed KGR model 3  2.552046  1.209125  1.338967  5.543936  4.701508
## Proposed KGR model 4  2.379925  1.271811  1.306669 13.504196  4.639968
## Proposed KGR model 5  2.467047  1.240156  1.431675  6.672147  5.766521
##                      Cluster 6  Cluster 7
## BYM model             1.640120  10.269695
## LGCP model            4.566204 133.321685
## Proposed KGR model 1  1.602156  13.364808
## Proposed KGR model 2  1.736369  12.660282
## Proposed KGR model 3  1.683484  12.016977
## Proposed KGR model 4  1.599527  12.908752
## Proposed KGR model 5  1.580372   8.020954

Mean absolute scaled error (MASE) is a measure of the accuracy of forecasts. It is the mean absolute error of the forecast values, divided by the mean absolute error of the in-sample one-step naive forecast.

MASE = \(\frac{\frac{1}{n} \sum_{t=1}^n |\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}|}{\frac{1}{n-1} \sum_{t=2}^n |\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h-1}^{obs}|}\)

MASE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  
  actual_test = c()
  
  for (j in 6:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }

  pm_1_test = pm_1$mean[54:60]
  pm_2_test = pm_2$mean[54:60]
  pm_3_test = pm_3$mean[54:60]
  pm_4_test = pm_4$mean[54:60]
  pm_5_test = pm_5$mean[54:60]
  pm_6_test = pm_6$mean[54:60]
  pm_7_test = pm_7$mean[54:60] 

  actual_test_mean = mean(actual_test)
  
  values1 = c()
  values2 = c()
  values3 = c()
  values4 = c()
  values5 = c()
  values6 = c()
  values7 = c()
  
  for (j in 2:length(actual_test)){
    error1 = (abs(actual_test[j] - pm_1_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values1 = c(values1,error1)
    
    error2 = (abs(actual_test[j] - pm_2_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values2 = c(values2,error2)
    
    error3 = (abs(actual_test[j] - pm_3_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values3 = c(values3,error3)
    
    error4 = (abs(actual_test[j] - pm_4_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values4 = c(values4,error4)
    
    error5 = (abs(actual_test[j] - pm_5_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values5 = c(values5,error5)
    
    error6 = (abs(actual_test[j] - pm_6_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values6 = c(values6,error6)
    
    error7 = (abs(actual_test[j] - pm_7_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
    values7 = c(values7,error7)
  }
  
  mase1 = mean(values1)
  mase2 = mean(values2)
  mase3 = mean(values3)
  mase4 = mean(values4)
  mase5 = mean(values5)
  mase6 = mean(values6)
  mase7 = mean(values7)

  MASE_table[,i] = c(mase1,mase2,mase3,mase4,mase5,mase6,mase7)
  # MASE_table[,i] = MASE_table[,i] / actual_test_mean
}

#Table 3: MASE on test dataset

MASE_table = data.frame(MASE_table)

colnames(MASE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
                         "Cluster 5","Cluster 6","Cluster 7")
rownames(MASE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

MASE_table
##                       Cluster 1 Cluster 2  Cluster 3 Cluster 4 Cluster 5
## BYM model            0.11082889 0.1186339 0.10069063 0.1150280 0.3876792
## LGCP model           0.11756555 0.1131511 0.40823139 1.1039072 2.2455052
## Proposed KGR model 1 0.10046525 0.1217125 0.09296285 0.2391136 0.2045915
## Proposed KGR model 2 0.11464126 0.1136011 0.09476828 0.1022811 0.4299532
## Proposed KGR model 3 0.10824772 0.1145728 0.09293818 0.1038517 0.3636252
## Proposed KGR model 4 0.09992527 0.1204713 0.09559031 0.2328482 0.2222687
## Proposed KGR model 5 0.10589939 0.1153586 0.11467229 0.1257901 0.3869496
##                      Cluster 6  Cluster 7
## BYM model            0.4052325 0.05246631
## LGCP model           1.3247305 1.20582039
## Proposed KGR model 1 0.3324935 0.12271767
## Proposed KGR model 2 0.3921780 0.06959371
## Proposed KGR model 3 0.3852535 0.07689193
## Proposed KGR model 4 0.3336957 0.11705401
## Proposed KGR model 5 0.4088131 0.03643970

Mean absolute percentage error (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of a forecasting method in statistics, expressing accuracy as a ratio

MAPE = \(\frac{100}{n} \sum_{t=1}^n |\frac{\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}}{\hat \lambda_{t+h}^{obs}}|\)

MAPE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()

  actual_test = c()
  
  for (j in 7:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }
  
  pm_1_test = pm_1[55:60,]
  pm_2_test = pm_2[55:60,]
  pm_3_test = pm_3[55:60,]
  pm_4_test = pm_4[55:60,]
  pm_5_test = pm_5[55:60,]
  pm_6_test = pm_6[55:60,]
  pm_7_test = pm_7[55:60,] 

  actual_test_mean = mean(actual_test)

  mape1 = mean(abs((actual_test - pm_1_test$mean)/actual_test))
  mape2 = mean(abs((actual_test - pm_2_test$mean)/actual_test))
  mape3 = mean(abs((actual_test - pm_3_test$mean)/actual_test))
  mape4 = mean(abs((actual_test - pm_4_test$mean)/actual_test))
  mape5 = mean(abs((actual_test - pm_5_test$mean)/actual_test))
  mape6 = mean(abs((actual_test - pm_6_test$mean)/actual_test))
  mape7 = mean(abs((actual_test - pm_7_test$mean)/actual_test))

  MAPE_table[,i] = c(mape1,mape2,mape3,mape4,mape5,mape6,mape7)
  # MAPE_table[,i] = MAPE_table[,i] / actual_test_mean
}

#Table 2: RMSE on test dataset

MAPE_table = data.frame(MAPE_table)

colnames(MAPE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
                         "Cluster 5","Cluster 6","Cluster 7")
rownames(MAPE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

MAPE_table
##                       Cluster 1 Cluster 2  Cluster 3  Cluster 4  Cluster 5
## BYM model            0.06821122 0.1315388 0.03827485 0.01898377 0.03650007
## LGCP model           0.08062306 0.1389393 0.12125218 0.17909734 0.22277840
## Proposed KGR model 1 0.06233313 0.1336754 0.03484493 0.03896357 0.02634501
## Proposed KGR model 2 0.07004766 0.1219473 0.03799166 0.01412227 0.03466680
## Proposed KGR model 3 0.06648521 0.1252480 0.03611217 0.01643415 0.02972294
## Proposed KGR model 4 0.06198905 0.1322238 0.03559361 0.03844637 0.02712708
## Proposed KGR model 5 0.06530630 0.1293881 0.03953602 0.01993782 0.03611728
##                       Cluster 6  Cluster 7
## BYM model            0.06162323 0.01424395
## LGCP model           0.17583632 0.20608410
## Proposed KGR model 1 0.05988300 0.01990579
## Proposed KGR model 2 0.06429126 0.01783325
## Proposed KGR model 3 0.06285866 0.01716333
## Proposed KGR model 4 0.05984390 0.01927444
## Proposed KGR model 5 0.06007114 0.01128304

One way to compare performance between the models fit above is to calculate RMSEs for each model’s fit on each cluster’s time series. Since INLA makes predictions based on the posterior predictive distribution, I actually calculated two sets of RMSEs. The first one is the RMSE of the predictions made by each model on the observed training data points i.e. not months 67-72. These were the observations that the models were fit on so we would expect small discrepancies between the observed values and the posterior predictive means for those time periods. The second one is the RMSE of the predictions made by each model on the test data points i.e. months 67-72. There was a lot more variation in the RMSEs calculated for these data points obviously.

Another important thing to note here is that the RMSEs calculated for each cluster were drastically different because the population sizes between clusters varied by a lot (think thousands compared to hundred thousands). So in order to make actual comparisons, the RMSEs had to be scaled which involves dividing the calculated RMSE by the average of the actual observed data points. The resulting RMSE values for each cluster, which are presented in tables below, can now be interpreted relative to the average number of respiratory related deaths in that cluster.

#Overall fit
test_RMSE_table = matrix(nrow=7,ncol=num_clus)

for (i in 1:num_clus){
  actual = inla_full_data %>% filter(id == i)  %>% data.frame()

  pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
  pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()

  actual_test = c()
  
  for (j in 7:12){
    actual_j = actual %>% filter(months == j) %>% select(response)
    est_lambda = mean(actual_j$response)
    actual_test = c(actual_test,as.numeric(est_lambda))
  }
  
  pm_1_test = pm_1[55:60,]
  pm_2_test = pm_2[55:60,]
  pm_3_test = pm_3[55:60,]
  pm_4_test = pm_4[55:60,]
  pm_5_test = pm_5[55:60,]
  pm_6_test = pm_6[55:60,]
  pm_7_test = pm_7[55:60,] 

  actual_test_mean = mean(actual_test)

  test_rmse1 = sqrt(mean((actual_test - pm_1_test$mean)^2))
  test_rmse2 = sqrt(mean((actual_test - pm_2_test$mean)^2))
  test_rmse3 = sqrt(mean((actual_test - pm_3_test$mean)^2))
  test_rmse4 = sqrt(mean((actual_test - pm_4_test$mean)^2))
  test_rmse5 = sqrt(mean((actual_test - pm_5_test$mean)^2))
  test_rmse6 = sqrt(mean((actual_test - pm_6_test$mean)^2))
  test_rmse7 = sqrt(mean((actual_test - pm_7_test$mean)^2))
  
  test_RMSE_table[,i] = c(test_rmse1,test_rmse2,test_rmse3,test_rmse4,
                          test_rmse5,test_rmse6,test_rmse7)
  test_RMSE_table[,i] = test_RMSE_table[,i] / actual_test_mean
}

#Table 2: RMSE on test dataset

test_RMSE_table = data.frame(test_RMSE_table)

colnames(test_RMSE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
                         "Cluster 5","Cluster 6","Cluster 7")
rownames(test_RMSE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3",
                         "Proposed KGR model 4","Proposed KGR model 5")

test_RMSE_table
##                       Cluster 1 Cluster 2  Cluster 3  Cluster 4  Cluster 5
## BYM model            0.07719908 0.1524892 0.04377826 0.02141524 0.04257103
## LGCP model           0.09324095 0.1686702 0.12594268 0.17927525 0.21914815
## Proposed KGR model 1 0.06716602 0.1527392 0.04102194 0.04852544 0.03063204
## Proposed KGR model 2 0.07996183 0.1422506 0.05013457 0.01544742 0.03641501
## Proposed KGR model 3 0.07541062 0.1464809 0.04524665 0.01831876 0.03286640
## Proposed KGR model 4 0.06692333 0.1517193 0.04189698 0.04800514 0.03006517
## Proposed KGR model 5 0.07456315 0.1471362 0.04261221 0.02365569 0.04074229
##                       Cluster 6  Cluster 7
## BYM model            0.06721347 0.01757784
## LGCP model           0.18638357 0.21394782
## Proposed KGR model 1 0.06175152 0.02220596
## Proposed KGR model 2 0.06882503 0.02191786
## Proposed KGR model 3 0.06732400 0.01920160
## Proposed KGR model 4 0.06170245 0.02145837
## Proposed KGR model 5 0.06694294 0.01373751

Comparing coverage rates of credible intervals produced by each model

coverage = rep(0,7)
true_values = inla_full_data$response
models_fvs = list(ref_model2_outfit$fitted_values,ref_model3_outfvs,kgr_model1_outfit$fitted_values,
                  kgr_model2_outfit$fitted_values,kgr_model3_outfit$fitted_values,kgr_model4_outfit$fitted_values,kgr_model5_outfit$fitted_values)

for (i in 1:7){
  lci = models_fvs[[i]] %>% select('0.025quant')
  uci = models_fvs[[i]] %>% select('0.975quant')
  
  captured = (true_values >= lci$'0.025quant' & true_values <= uci$'0.975quant')
  coverage[i] = sum(captured)/length(captured)
}

coverage = data.frame(coverage)
colnames(coverage) = "95% coverage"
rownames(coverage) = c("BYM model","LGCP model","Proposed KGR model 1",
                         "Proposed KGR model 2","Proposed KGR model 3","Proposed KGR model 4","Proposed KGR model 5")
coverage
##                      95% coverage
## BYM model               0.3166667
## LGCP model              0.1285714
## Proposed KGR model 1    0.7880952
## Proposed KGR model 2    0.7928571
## Proposed KGR model 3    0.7928571
## Proposed KGR model 4    0.7857143
## Proposed KGR model 5    0.7928571

Plotting heatmaps of CA with estimated mean and variance of intensity function for each model

#Plot heatmap for time = 60
clusterlabels$counties = tolower(clusterlabels$counties)
colnames(clusterlabels) = c("subregion","cluster")
merged_response = join(ca_map,clusterlabels,by = "subregion")

true_values = inla_full_data %>% filter(time == 60) %>% select(id,response)
colnames(true_values) = c("cluster","response")
merged_response = join(merged_response,true_values,by = "cluster")

heatmap_limits = c(0,2000)
legend_titles = c("Ref model 2 fitted values","Ref model 3 fitted values","Prop model 1 fitted values",
                  "Prop model 2 fitted values","Prop model 3 fitted values","Prop model 4 fitted values")

#Plot of observed mortality
gg_pop <- ggplot() +
  geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = response), 
               color = "black") +
  coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
  scale_fill_viridis_c(limits = heatmap_limits, name = "Observed mortality") +
  theme_void() +
    labs(title = "Observed Mortality (Dec 2018) for CA",
         x = "Longitude",
         y = "Latitude")

print(gg_pop)

Plotting heatmaps of mean of intensity function

models_intensity_fvs = list(ref_model1_outfit$fitted_values,ref_model2_outfit$fitted_values,ref_model3_outfvs,
                            kgr_model1_outfit$fitted_values,kgr_model2_outfit$fitted_values,kgr_model3_outfit$fitted_values,
                            kgr_model4_outfit$fitted_values,kgr_model5_outfit$fitted_values)

merged_response = join(ca_map,clusterlabels,by = "subregion")

legend_titles = c("Ref model 1 fitted values","Ref model 2 fitted values","Ref model 3 fitted values",
                  "Prop model 1 fitted values","Prop model 2 fitted values","Prop model 3 fitted values",
                  "Prop model 4 fitted values","Prop model 5 fitted values")

for (i in 1:7){
  fitted_values = models_intensity_fvs[[i]] %>% filter(time == 60) %>% select(id,mean)
  heatmap_limits = c(0,1.5*max(fitted_values$mean))
  colname = sprintf("prediction.%s",i)
  colnames(fitted_values) = c("cluster",colname)
  merged_response = join(merged_response,fitted_values,by = "cluster")
  
  #Heatmap of each model's fvs 
  gg_pop <- ggplot() +
    geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = merged_response[,i+7]), 
                 color = "black") +
    coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
    scale_fill_viridis_c(limits = heatmap_limits, name = legend_titles[i]) +
    theme_void() +
      labs(title = "Predicted Mean (Dec 2019) for CA",
         x = "Longitude",
         y = "Latitude")
  
  print(gg_pop)
}

Plotting heatmaps of variance of intensity function

merged_response = join(ca_map,clusterlabels,by = "subregion")

for (j in 1:7){
  fitted_values = models_intensity_fvs[[j]] %>% filter(time == 60) %>% select(id,sd)
  fitted_values$sd = fitted_values$sd^2
  heatmap_limits = c(0,1.5*max(fitted_values$sd))
  colname = sprintf("prediction.%s",j)
  colnames(fitted_values) = c("cluster",colname)
  merged_response = join(merged_response,fitted_values,by = "cluster")
  
  #Heatmap of each model's fvs 
  gg_pop <- ggplot() +
    geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = merged_response[,j+7]), 
                 color = "black") +
    coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
    scale_fill_viridis_c(limits = heatmap_limits, name = legend_titles[j]) +
    theme_void() +
      labs(title = "Predicted Variance (Dec 2019) for CA",
         x = "Longitude",
         y = "Latitude")
  
  print(gg_pop)
}

Sliding timeframe forecasting exercise:

In this section, I implemented a forecasting exercise in which I start with 36 months of training data. I will use that data to estimate a model and then forecast 6 months ahead. Using months 13-36 AND the forecasted values, I will re-estimate the model and forecast again to get the next 6 months. This process continues until the original 36 months of data have been used to produce a complete time series of 60 months (the last 3 years are forecasted). This exercise gives us another way to compare the predictive ability of our various models.

ONE STEP AHEAD FORECASTING

Reference model

ref_model_error = cbind(MAE,MASE,MAPE,RMSPE)
ref_model_error
##             MAE        MASE       MAPE     RMSPE
##  [1,] 18.228571 0.020729671 0.08875661 25.497563
##  [2,]  7.600000 0.018738550 0.04932430 12.802678
##  [3,]  3.000000 0.026709075 0.04295158  3.796991
##  [4,]  3.400000 0.012414181 0.02372278  6.187083
##  [5,]  4.171429 0.066365751 0.03513482  5.475139
##  [6,]  2.571429 0.039829518 0.04001582  3.140519
##  [7,]  4.971429 0.084333436 0.04883699  6.975058
##  [8,]  3.628571 0.049233209 0.06536543  4.872664
##  [9,]  5.828571 0.092609032 0.08009851  8.024961
## [10,]  1.714286 0.042161765 0.03469257  2.140761
## [11,]  4.657143 0.080198043 0.06214016  5.826540
## [12,]  9.371429 0.004038253 0.05799603 17.473245
## [13,] 18.228571 0.013031877 0.08875661 25.497563
## [14,]  7.600000 0.015874065 0.04932430 12.802678
## [15,]  3.000000 0.027994560 0.04295158  3.796991
## [16,]  3.400000 0.012598345 0.02372278  6.187083
## [17,]  4.171429 0.066006542 0.03513482  5.475139
## [18,]  2.571429 0.038961132 0.04001582  3.140519
## [19,]  4.971429 0.083501151 0.04883699  6.975058
## [20,]  3.628571 0.047773676 0.06536543  4.872664
## [21,]  5.828571 0.091153982 0.08009851  8.024961
## [22,]  1.714286 0.041326319 0.03469257  2.140761
## [23,]  4.800000 0.079698886 0.06680869  5.853448
## [24,]  9.371429 0.003106388 0.05799603 17.473245
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5)
  print(post_pred_plot)
}

Proposed model

prop_model_error = cbind(MAE2,MASE2,MAPE2,RMSPE2)
prop_model_error
##            MAE2       MASE2      MAPE2    RMSPE2
##  [1,] 18.228571 0.020729671 0.08875661 25.497563
##  [2,]  7.600000 0.018738550 0.04932430 12.802678
##  [3,]  3.000000 0.026709075 0.04295158  3.796991
##  [4,]  3.400000 0.012414181 0.02372278  6.187083
##  [5,]  4.171429 0.066365751 0.03513482  5.475139
##  [6,]  2.571429 0.039829518 0.04001582  3.140519
##  [7,]  4.971429 0.084333436 0.04883699  6.975058
##  [8,]  3.628571 0.049233209 0.06536543  4.872664
##  [9,]  5.828571 0.092609032 0.08009851  8.024961
## [10,]  1.714286 0.042161765 0.03469257  2.140761
## [11,]  4.657143 0.080198043 0.06214016  5.826540
## [12,]  9.371429 0.004038253 0.05799603 17.473245
## [13,] 18.228571 0.013031877 0.08875661 25.497563
## [14,]  7.600000 0.015874065 0.04932430 12.802678
## [15,]  3.000000 0.027994560 0.04295158  3.796991
## [16,]  3.400000 0.012598345 0.02372278  6.187083
## [17,]  4.171429 0.066006542 0.03513482  5.475139
## [18,]  2.571429 0.038961132 0.04001582  3.140519
## [19,]  4.971429 0.083501151 0.04883699  6.975058
## [20,]  3.628571 0.047773676 0.06536543  4.872664
## [21,]  5.828571 0.091153982 0.08009851  8.024961
## [22,]  1.714286 0.041326319 0.03469257  2.140761
## [23,]  4.800000 0.079698886 0.06680869  5.853448
## [24,]  9.371429 0.003106388 0.05799603 17.473245
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5)
  print(post_pred_plot)
}

Grouped barplot for each error metric over 24 time points

# Create data frames for both tables
data1 <- data.frame(
  Time = 1:24,MAE,MASE,MAPE,RMSPE
)

data2 <- data.frame(
  Time = 1:24,MAE2,MASE2,MAPE2,RMSPE2
)
colnames(data2) = colnames(data1)

# Add a column to each data frame to indicate the source table
data1$Source <- 'Table 1'
data2$Source <- 'Table 2'

# Combine the two data frames
combined_data <- rbind(data1, data2)

# Melt the combined data to long format
data_long <- melt(combined_data, id.vars = c("Time", "Source"), variable.name = "Metric", value.name = "Value")

# Plot grouped bar charts using ggplot2
ggplot(data_long, aes(x = factor(Time), y = Value, fill = Source)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(x = "Time Points", y = "Error Values", title = "Error Metrics Over Time") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

6 month sliding window

Reference model iteration

starting_data = inla_full_data[1:252,]
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL

while(max(starting_data$time) < 60){
  
  ###Attach df for next 6 months with NAs in response
  end = nrow(starting_data)
  id = rep(1:7,6)
  id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+42)
  response = rep(NA,42)
  time = rep((starting_data$time[end]+1):(starting_data$time[end]+6),each=7)
  Intercept1 = rep(c(1,NA,NA,NA,NA,NA,NA),6)
  Intercept2 = rep(c(NA,1,NA,NA,NA,NA,NA),6)
  Intercept3 = rep(c(NA,NA,1,NA,NA,NA,NA),6)
  Intercept4 = rep(c(NA,NA,NA,1,NA,NA,NA),6)
  Intercept5 = rep(c(NA,NA,NA,NA,1,NA,NA),6)
  Intercept6 = rep(c(NA,NA,NA,NA,NA,1,NA),6)
  Intercept7 = rep(c(NA,NA,NA,NA,NA,NA,1),6)
  
  if (starting_data$months[end] == 6){
    months = rep(c(7,8,9,10,11,12),each=7)
  } else if (starting_data$months[end] == 12){
    months = rep(c(1,2,3,4,5,6),each=7)
  }
  
  new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2,Intercept3,
                        Intercept4,Intercept5,Intercept6,Intercept7)
  starting_data = rbind(starting_data,new_data)
  starting_data$months = factor(starting_data$months)
  
  ref_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id, model = "bym", graph = huge.est) 
  ref_model2 = inla(ref_formula2,family = "poisson",data = starting_data,
                  control.compute = list(dic=TRUE,waic=TRUE),
                  control.predictor = list(compute = TRUE, link = 1))
  
  ###Append ref model 2 predictions to starting data
  preds_ref_model2 = ref_model2$summary.fitted.values
  preds_ref_model2$mean = round(preds_ref_model2$mean)

  end2 = nrow(preds_ref_model2)

  pred_data = preds_ref_model2$mean[(end2-41):end2]
  starting_data$response[(end+1):(end+42)] = pred_data

  starting_data$months = as.numeric(starting_data$months)
}
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(sprintf("Sliding Timeframe Forecast for Cluster %1.0f",i))
  print(post_pred_plot)
}

Proposed model iteration

inv_covGP = kgr_model2_outfit$prec
inv_covGP3 = kgr_model3_outfit$prec
inv_covGP4 = kgr_model4_outfit$prec
inv_covGP5 = kgr_model5_outfit$prec

starting_data = inla_full_data[1:252,]
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL

while(max(starting_data$time) < 60){
  
  ###Attach df for next 6 months with NAs in response
  end = nrow(starting_data)
  id = rep(1:7,6)
  id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+42)
  response = rep(NA,42)
  time = rep((starting_data$time[end]+1):(starting_data$time[end]+6),each=7)
  Intercept1 = rep(c(1,NA,NA,NA,NA,NA,NA),6)
  Intercept2 = rep(c(NA,1,NA,NA,NA,NA,NA),6)
  Intercept3 = rep(c(NA,NA,1,NA,NA,NA,NA),6)
  Intercept4 = rep(c(NA,NA,NA,1,NA,NA,NA),6)
  Intercept5 = rep(c(NA,NA,NA,NA,1,NA,NA),6)
  Intercept6 = rep(c(NA,NA,NA,NA,NA,1,NA),6)
  Intercept7 = rep(c(NA,NA,NA,NA,NA,NA,1),6)
  
  if (starting_data$months[end] == 6){
    months = rep(c(7,8,9,10,11,12),each=7)
  } else if (starting_data$months[end] == 12){
    months = rep(c(1,2,3,4,5,6),each=7)
  }
  
  new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2,Intercept3,
                      Intercept4,Intercept5,Intercept6,Intercept7)
  starting_data = rbind(starting_data,new_data)
  starting_data$months = factor(starting_data$months)
  
  
  ###Fit KGR model on most recent 36 months
  starting_data_subset = starting_data[(nrow(starting_data)-293):end,]
  
  # #Proposed model 2
  # kgr_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP[c(starting_data_subset$id2),c(starting_data_subset$id2)])
  # 
  # kgr_model2 = inla(kgr_formula2, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model2$summary.fitted.values
  
  # #Proposed model 3
  # kgr_formula3 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP3[c(starting_data_subset$id2),c(starting_data_subset$id2)])
  # 
  # kgr_model3 = inla(kgr_formula3, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model3$summary.fitted.values
  
  # #Proposed model 4
  # kgr_formula4 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP4[c(starting_data_subset$id2),c(starting_data_subset$id2)])
  # 
  # kgr_model4 = inla(kgr_formula4, data = starting_data_subset, family = "poisson",
  #                   control.predictor = list(compute = TRUE, link = 1))
  # 
  # preds_kgr_model = kgr_model4$summary.fitted.values
  
  #Proposed model 5
  kgr_formula5 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP5[c(starting_data_subset$id2),c(starting_data_subset$id2)])

  kgr_model5 = inla(kgr_formula5, data = starting_data_subset, family = "poisson",
                    control.predictor = list(compute = TRUE, link = 1))

  preds_kgr_model = kgr_model5$summary.fitted.values
  
  
  ###Append KGR model predictions to starting data
  preds_kgr_model$mean = round(preds_kgr_model$mean)
  end2 = nrow(preds_kgr_model)

  pred_data = preds_kgr_model$mean[(end2-41):end2]
  starting_data$response[(end+1):(end+42)] = pred_data
  
  # starting_data$response = replace(starting_data$response,which(starting_data$response < 0),0)
  starting_data$months = as.numeric(starting_data$months)
}
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data

for (i in 1:num_clus){
  df = true_mortality %>% filter(id == i) %>% select(response)
  preds = starting_data %>% filter(id == i) 
  colnames(preds)[3] = "mean"
  df = cbind(df,preds)
  
  post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() + 
    geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5)
  print(post_pred_plot)
}